// rpc.proto — tester/agent RPC schema (extraction artifacts removed).
  1. syntax = "proto3";
  2. package rpcpb;
  3. import "github.com/gogo/protobuf/gogoproto/gogo.proto";
  4. option (gogoproto.marshaler_all) = true;
  5. option (gogoproto.sizer_all) = true;
  6. option (gogoproto.unmarshaler_all) = true;
  7. option (gogoproto.goproto_getters_all) = false;
  8. message Request {
  9. Operation Operation = 1;
  10. // Member contains the same Member object from tester configuration.
  11. Member Member = 2;
  12. // Tester contains tester configuration.
  13. Tester Tester = 3;
  14. }
  15. // SnapshotInfo contains SAVE_SNAPSHOT request results.
  16. message SnapshotInfo {
  17. string MemberName = 1;
  18. repeated string MemberClientURLs = 2;
  19. string SnapshotPath = 3;
  20. string SnapshotFileSize = 4;
  21. string SnapshotTotalSize = 5;
  22. int64 SnapshotTotalKey = 6;
  23. int64 SnapshotHash = 7;
  24. int64 SnapshotRevision = 8;
  25. string Took = 9;
  26. }
  27. message Response {
  28. bool Success = 1;
  29. string Status = 2;
  30. // Member contains the same Member object from tester request.
  31. Member Member = 3;
  32. // SnapshotInfo contains SAVE_SNAPSHOT request results.
  33. SnapshotInfo SnapshotInfo = 4;
  34. }
  35. service Transport {
  36. rpc Transport(stream Request) returns (stream Response) {}
  37. }
  38. message Member {
  39. // EtcdExec is the executable etcd binary path in agent server.
  40. string EtcdExec = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec\""];
  41. // AgentAddr is the agent HTTP server address.
  42. string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
  43. // FailpointHTTPAddr is the agent's failpoints HTTP server address.
  44. string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];
  45. // BaseDir is the base directory where all logs and etcd data are stored.
  46. string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
  47. // EtcdClientProxy is true when client traffic needs to be proxied.
  48. // If true, listen client URL port must be different than advertise client URL port.
  49. bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
  50. // EtcdPeerProxy is true when peer traffic needs to be proxied.
  51. // If true, listen peer URL port must be different than advertise peer URL port.
  52. bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];
  53. // EtcdClientEndpoint is the etcd client endpoint.
  54. string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
  55. // Etcd defines etcd binary configuration flags.
  56. Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
  57. // EtcdOnSnapshotRestore defines one-time use configuration during etcd
  58. // snapshot recovery process.
  59. Etcd EtcdOnSnapshotRestore = 303;
  60. // ClientCertData contains cert file contents from this member's etcd server.
  61. string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
  62. string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
  63. // ClientKeyData contains key file contents from this member's etcd server.
  64. string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
  65. string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
  66. // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
  67. string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
  68. string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];
  69. // PeerCertData contains cert file contents from this member's etcd server.
  70. string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
  71. string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
  72. // PeerKeyData contains key file contents from this member's etcd server.
  73. string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
  74. string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
  75. // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
  76. string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
  77. string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
  78. // SnapshotPath is the snapshot file path to store or restore from.
  79. string SnapshotPath = 601 [(gogoproto.moretags) = "yaml:\"snapshot-path\""];
  80. // SnapshotInfo contains last SAVE_SNAPSHOT request results.
  81. SnapshotInfo SnapshotInfo = 602;
  82. }
  83. message Tester {
  84. string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  85. string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
  86. string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];
  87. // DelayLatencyMsRv is the delay latency in milliseconds,
  88. // to inject to simulated slow network.
  89. uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  90. // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  91. uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  92. // UpdatedDelayLatencyMs is the update delay latency in milliseconds,
  93. // to inject to simulated slow network. It's the final latency to apply,
  94. // in case the latency numbers are randomly generated from given delay latency field.
  95. uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];
  96. // RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
  97. int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  98. // ExitOnCaseFail is true, then exit tester on first failure.
  99. bool ExitOnCaseFail = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  100. // EnablePprof is true to enable profiler.
  101. bool EnablePprof = 23 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];
  102. // CaseDelayMs is the delay duration after failure is injected.
  103. // Useful when triggering snapshot or no-op failure cases.
  104. uint32 CaseDelayMs = 31 [(gogoproto.moretags) = "yaml:\"case-delay-ms\""];
  105. // CaseShuffle is true to randomize failure injecting order.
  106. bool CaseShuffle = 32 [(gogoproto.moretags) = "yaml:\"case-shuffle\""];
  107. // Cases is the selected test cases to schedule.
  108. // If empty, run all failure cases.
  109. repeated string Cases = 33 [(gogoproto.moretags) = "yaml:\"cases\""];
  110. // FailpointCommands is the list of "gofail" commands
  111. // (e.g. panic("etcd-tester"),1*sleep(1000).
  112. repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];
  113. // RunnerExecPath is a path of etcd-runner binary.
  114. string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  115. // ExternalExecPath is a path of script for enabling/disabling an external fault injector.
  116. string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];
  117. // Stressers is the list of stresser types:
  118. // KV, LEASE, ELECTION_RUNNER, WATCH_RUNNER, LOCK_RACER_RUNNER, LEASE_RUNNER.
  119. repeated Stresser Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
  120. // Checkers is the list of consistency checker types:
  121. // KV_HASH, LEASE_EXPIRE, NO_CHECK, RUNNER.
  122. // Leave empty to skip consistency checks.
  123. repeated string Checkers = 102 [(gogoproto.moretags) = "yaml:\"checkers\""];
  124. // StressKeySize is the size of each small key written into etcd.
  125. int32 StressKeySize = 201 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  126. // StressKeySizeLarge is the size of each large key written into etcd.
  127. int32 StressKeySizeLarge = 202 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  128. // StressKeySuffixRange is the count of key range written into etcd.
  129. // Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)".
  130. int32 StressKeySuffixRange = 203 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  131. // StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
  132. // Stress keys are created with "fmt.Sprintf("/k%03d", i)".
  133. int32 StressKeySuffixRangeTxn = 204 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  134. // StressKeyTxnOps is the number of operations per a transaction (max 64).
  135. int32 StressKeyTxnOps = 205 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];
  136. // StressClients is the number of concurrent stressing clients
  137. // with "one" shared TCP connection.
  138. int32 StressClients = 301 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  139. // StressQPS is the maximum number of stresser requests per second.
  140. int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
  141. }
  142. enum StresserType {
  143. KV_WRITE_SMALL = 0;
  144. KV_WRITE_LARGE = 1;
  145. KV_READ_ONE_KEY = 2;
  146. KV_READ_RANGE = 3;
  147. KV_DELETE_ONE_KEY = 4;
  148. KV_DELETE_RANGE = 5;
  149. KV_TXN_WRITE_DELETE = 6;
  150. LEASE = 10;
  151. ELECTION_RUNNER = 20;
  152. WATCH_RUNNER = 31;
  153. LOCK_RACER_RUNNER = 41;
  154. LEASE_RUNNER = 51;
  155. }
  156. message Stresser {
  157. string Type = 1 [(gogoproto.moretags) = "yaml:\"type\""];
  158. double Weight = 2 [(gogoproto.moretags) = "yaml:\"weight\""];
  159. }
  160. enum Checker {
  161. KV_HASH = 0;
  162. LEASE_EXPIRE = 1;
  163. RUNNER = 2;
  164. NO_CHECK = 3;
  165. }
  166. message Etcd {
  167. string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
  168. string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  169. string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];
  170. // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
  171. // Default value is 100, which is 100ms.
  172. int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
  173. // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
  174. // Default value is 1000, which is 1s.
  175. int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];
  176. repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
  177. repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
  178. bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
  179. bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
  180. string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
  181. string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
  182. string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];
  183. repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
  184. repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
  185. bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
  186. bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
  187. string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
  188. string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
  189. string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];
  190. string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
  191. string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
  192. string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];
  193. int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
  194. int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];
  195. bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
  196. bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
  197. string Logger = 71 [(gogoproto.moretags) = "yaml:\"logger\""];
  198. // LogOutputs is the log file to store current etcd server logs.
  199. repeated string LogOutputs = 72 [(gogoproto.moretags) = "yaml:\"log-outputs\""];
  200. string LogLevel = 73 [(gogoproto.moretags) = "yaml:\"log-level\""];
  201. }
  202. enum Operation {
  203. // NOT_STARTED is the agent status before etcd first start.
  204. NOT_STARTED = 0;
  205. // INITIAL_START_ETCD is only called to start etcd, the very first time.
  206. INITIAL_START_ETCD = 10;
  207. // RESTART_ETCD is sent to restart killed etcd.
  208. RESTART_ETCD = 11;
  209. // SIGTERM_ETCD pauses etcd process while keeping data directories
  210. // and previous etcd configurations.
  211. SIGTERM_ETCD = 20;
  212. // SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
  213. // directories to simulate destroying the whole machine.
  214. SIGQUIT_ETCD_AND_REMOVE_DATA = 21;
  215. // SAVE_SNAPSHOT is sent to trigger local member to download its snapshot
  216. // onto its local disk with the specified path from tester.
  217. SAVE_SNAPSHOT = 30;
  218. // RESTORE_RESTART_FROM_SNAPSHOT is sent to trigger local member to
  219. // restore a cluster from existing snapshot from disk, and restart
  220. // an etcd instance from recovered data.
  221. RESTORE_RESTART_FROM_SNAPSHOT = 31;
  222. // RESTART_FROM_SNAPSHOT is sent to trigger local member to restart
  223. // and join an existing cluster that has been recovered from a snapshot.
  224. // Local member joins this cluster with fresh data.
  225. RESTART_FROM_SNAPSHOT = 32;
  226. // SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when consistency check failed,
  227. // thus need to archive etcd data directories.
  228. SIGQUIT_ETCD_AND_ARCHIVE_DATA = 40;
  229. // SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys etcd process,
  230. // etcd data, and agent server.
  231. SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 41;
  232. // BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
  233. // the peer port on target member's peer port.
  234. BLACKHOLE_PEER_PORT_TX_RX = 100;
  235. // UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
  236. UNBLACKHOLE_PEER_PORT_TX_RX = 101;
  237. // DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
  238. // the peer port on target member's peer port.
  239. DELAY_PEER_PORT_TX_RX = 200;
  240. // UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
  241. UNDELAY_PEER_PORT_TX_RX = 201;
  242. }
  243. // Case defines various system faults or test case in distributed systems,
  244. // in order to verify correct behavior of etcd servers and clients.
  245. enum Case {
  246. // SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
  247. // but does not delete its data directories on disk for next restart.
  248. // It waits "delay-ms" before recovering this failure.
  249. // The expected behavior is that the follower comes back online
  250. // and rejoins the cluster, and then each member continues to process
  251. // client requests ('Put' request that requires Raft consensus).
  252. SIGTERM_ONE_FOLLOWER = 0;
  253. // SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
  254. // follower but does not delete its data directories on disk for next
  255. // restart. And waits until most up-to-date node (leader) applies the
  256. // snapshot count of entries since the stop operation.
  257. // The expected behavior is that the follower comes back online and
  258. // rejoins the cluster, and then active leader sends snapshot
  259. // to the follower to force it to follow the leader's log.
  260. // As always, after recovery, each member must be able to process
  261. // client requests.
  262. SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;
  263. // SIGTERM_LEADER stops the active leader node but does not delete its
  264. // data directories on disk for next restart. Then it waits "delay-ms"
  265. // before recovering this failure, in order to trigger election timeouts.
  266. // The expected behavior is that a new leader gets elected, and the
  267. // old leader comes back online and rejoins the cluster as a follower.
  268. // As always, after recovery, each member must be able to process
  269. // client requests.
  270. SIGTERM_LEADER = 2;
  271. // SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
  272. // but does not delete its data directories on disk for next restart.
  273. // And waits until most up-to-date node ("new" leader) applies the
  274. // snapshot count of entries since the stop operation.
  275. // The expected behavior is that cluster elects a new leader, and the
  276. // old leader comes back online and rejoins the cluster as a follower.
  277. // And it receives the snapshot from the new leader to overwrite its
  278. // store. As always, after recovery, each member must be able to
  279. // process client requests.
  280. SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;
  281. // SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
  282. // inoperable but does not delete data directories on stopped nodes
  283. // for next restart. And it waits "delay-ms" before recovering failure.
  284. // The expected behavior is that nodes come back online, thus cluster
  285. // comes back operative as well. As always, after recovery, each member
  286. // must be able to process client requests.
  287. SIGTERM_QUORUM = 4;
  288. // SIGTERM_ALL stops the whole cluster but does not delete data directories
  289. // on disk for next restart. And it waits "delay-ms" before recovering
  290. // this failure.
  291. // The expected behavior is that nodes come back online, thus cluster
  292. // comes back operative as well. As always, after recovery, each member
  293. // must be able to process client requests.
  294. SIGTERM_ALL = 5;
  295. // SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
  296. // (non-leader), deletes its data directories on disk, and removes
  297. // this member from cluster (membership reconfiguration). On recovery,
  298. // tester adds a new member, and this member joins the existing cluster
  299. // with fresh data. It waits "delay-ms" before recovering this
  300. // failure. This simulates destroying one follower machine, where operator
  301. // needs to add a new member from a fresh machine.
  302. // The expected behavior is that a new member joins the existing cluster,
  303. // and then each member continues to process client requests.
  304. SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;
  305. // SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly
  306. // chosen follower, deletes its data directories on disk, and removes
  307. // this member from cluster (membership reconfiguration). On recovery,
  308. // tester adds a new member, and this member joins the existing cluster
  309. // restart. On member remove, cluster waits until most up-to-date node
  310. // (leader) applies the snapshot count of entries since the stop operation.
  311. // This simulates destroying a leader machine, where operator needs to add
  312. // a new member from a fresh machine.
  313. // The expected behavior is that a new member joins the existing cluster,
  314. // and receives a snapshot from the active leader. As always, after
  315. // recovery, each member must be able to process client requests.
  316. SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;
  317. // SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
  318. // data directories on disk, and removes this member from cluster.
  319. // On recovery, tester adds a new member, and this member joins the
  320. // existing cluster with fresh data. It waits "delay-ms" before
  321. // recovering this failure. This simulates destroying a leader machine,
  322. // where operator needs to add a new member from a fresh machine.
  323. // The expected behavior is that a new member joins the existing cluster,
  324. // and then each member continues to process client requests.
  325. SIGQUIT_AND_REMOVE_LEADER = 12;
  326. // SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader,
  327. // deletes its data directories on disk, and removes this member from
  328. // cluster (membership reconfiguration). On recovery, tester adds a new
  329. // member, and this member joins the existing cluster restart. On member
  330. // remove, cluster waits until most up-to-date node (new leader) applies
  331. // the snapshot count of entries since the stop operation. This simulates
  332. // destroying a leader machine, where operator needs to add a new member
  333. // from a fresh machine.
  334. // The expected behavior is that on member remove, cluster elects a new
  335. // leader, and a new member joins the existing cluster and receives a
  336. // snapshot from the newly elected leader. As always, after recovery, each
  337. // member must be able to process client requests.
  338. SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;
  339. // SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH first
  340. // stops majority number of nodes, deletes data directories on those quorum
  341. // nodes, to make the whole cluster inoperable. Now that quorum and their
  342. // data are totally destroyed, cluster cannot even remove unavailable nodes
  343. // (e.g. 2 out of 3 are lost, so no leader can be elected).
  344. // Let's assume 3-node cluster of node A, B, and C. One day, node A and B
  345. // are destroyed and all their data are gone. The only viable solution is
  346. // to recover from C's latest snapshot.
  347. //
  348. // To simulate:
  349. // 1. Assume node C is the current leader with most up-to-date data.
  350. // 2. Download snapshot from node C, before destroying node A and B.
  351. // 3. Destroy node A and B, and make the whole cluster inoperable.
  352. // 4. Now node C cannot operate either.
  353. // 5. SIGTERM node C and remove its data directories.
  354. // 6. Restore a new seed member from node C's latest snapshot file.
  355. // 7. Add another member to establish 2-node cluster.
  356. // 8. Add another member to establish 3-node cluster.
  357. // 9. Add more if any.
  358. //
  359. // The expected behavior is that etcd successfully recovers from such
  360. // disastrous situation as only 1-node survives out of 3-node cluster,
  361. // new members joins the existing cluster, and previous data from snapshot
  362. // are still preserved after recovery process. As always, after recovery,
  363. // each member must be able to process client requests.
  364. SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH = 14;
  365. // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
  366. // packets from/to the peer port on a randomly chosen follower
  367. // (non-leader), and waits for "delay-ms" until recovery.
  368. // The expected behavior is that once dropping operation is undone,
  369. // each member must be able to process client requests.
  370. BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;
  371. // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
  372. // all outgoing/incoming packets from/to the peer port on a randomly
  373. // chosen follower (non-leader), and waits for most up-to-date node
  374. // (leader) applies the snapshot count of entries since the blackhole
  375. // operation.
  376. // The expected behavior is that once packet drop operation is undone,
  377. // the slow follower tries to catch up, possibly receiving the snapshot
  378. // from the active leader. As always, after recovery, each member must
  379. // be able to process client requests.
  380. BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;
  381. // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
  382. // from/to the peer port on the active leader (isolated), and waits for
  383. // "delay-ms" until recovery, in order to trigger election timeout.
  384. // The expected behavior is that after election timeout, a new leader gets
  385. // elected, and once dropping operation is undone, the old leader comes
  386. // back and rejoins the cluster as a follower. As always, after recovery,
  387. // each member must be able to process client requests.
  388. BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;
  389. // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
  390. // outgoing/incoming packets from/to the peer port on the active leader,
  391. // and waits for most up-to-date node (leader) applies the snapshot
  392. // count of entries since the blackhole operation.
  393. // The expected behavior is that cluster elects a new leader, and once
  394. // dropping operation is undone, the old leader comes back and rejoins
  395. // the cluster as a follower. The slow follower tries to catch up, likely
  396. // receiving the snapshot from the new active leader. As always, after
  397. // recovery, each member must be able to process client requests.
  398. BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;
  399. // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
  400. // from/to the peer ports on majority nodes of cluster, thus losing its
  401. // leader and cluster being inoperable. And it waits for "delay-ms"
  402. // until recovery.
  403. // The expected behavior is that once packet drop operation is undone,
  404. // nodes come back online, thus cluster comes back operative. As always,
  405. // after recovery, each member must be able to process client requests.
  406. BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;
  407. // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
  408. // from/to the peer ports on all nodes, thus making cluster totally
  409. // inoperable. It waits for "delay-ms" until recovery.
  410. // The expected behavior is that once packet drop operation is undone,
  411. // nodes come back online, thus cluster comes back operative. As always,
  412. // after recovery, each member must be able to process client requests.
  413. BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;
  414. // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
  415. // from/to the peer port on a randomly chosen follower (non-leader).
  416. // It waits for "delay-ms" until recovery.
  417. // The expected behavior is that once packet delay operation is undone,
  418. // the follower comes back and tries to catch up with latest changes from
  419. // cluster. And as always, after recovery, each member must be able to
  420. // process client requests.
  421. DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;
  422. // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
  423. // packets from/to the peer port on a randomly chosen follower
  424. // (non-leader) with a randomized time duration (thus isolated). It
  425. // waits for "delay-ms" until recovery.
  426. // The expected behavior is that once packet delay operation is undone,
  427. // each member must be able to process client requests.
  428. RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;
  429. // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  430. // outgoing/incoming packets from/to the peer port on a randomly chosen
  431. // follower (non-leader), and waits for most up-to-date node (leader)
  432. // applies the snapshot count of entries since the delay operation.
  433. // The expected behavior is that the delayed follower gets isolated
  434. // and behind the current active leader, and once delay operation is undone,
  435. // the slow follower comes back and catches up possibly receiving snapshot
  436. // from the active leader. As always, after recovery, each member must be
  437. // able to process client requests.
  438. DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;
  439. // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  440. // outgoing/incoming packets from/to the peer port on a randomly chosen
  441. // follower (non-leader) with a randomized time duration, and waits for
  442. // most up-to-date node (leader) applies the snapshot count of entries
  443. // since the delay operation.
  444. // The expected behavior is that the delayed follower gets isolated
  445. // and behind the current active leader, and once delay operation is undone,
  446. // the slow follower comes back and catches up, possibly receiving a
  447. // snapshot from the active leader. As always, after recovery, each member
  448. // must be able to process client requests.
  449. RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;
  450. // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
  451. // the peer port on the active leader. And waits for "delay-ms" until
  452. // recovery.
  453. // The expected behavior is that cluster may elect a new leader, and
  454. // once packet delay operation is undone, the (old) leader comes back
  455. // and tries to catch up with latest changes from cluster. As always,
  456. // after recovery, each member must be able to process client requests.
  457. DELAY_PEER_PORT_TX_RX_LEADER = 204;
  458. // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  459. // from/to the peer port on the active leader with a randomized time
  460. // duration. And waits for "delay-ms" until recovery.
  461. // The expected behavior is that cluster may elect a new leader, and
  462. // once packet delay operation is undone, the (old) leader comes back
  463. // and tries to catch up with latest changes from cluster. As always,
  464. // after recovery, each member must be able to process client requests.
  465. RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;
  466. // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  467. // outgoing/incoming packets from/to the peer port on the active leader,
  468. // and waits until the most up-to-date node (current or new leader) applies
  469. // the snapshot count of entries since the delay operation.
  470. // The expected behavior is that cluster may elect a new leader, and
  471. // the old leader gets isolated and behind the current active leader,
  472. // and once delay operation is undone, the slow follower comes back
  473. // and catches up, likely receiving a snapshot from the active leader.
  474. // As always, after recovery, each member must be able to process client
  475. // requests.
  476. DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;
  477. // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  478. // outgoing/incoming packets from/to the peer port on the active leader,
  479. // with a randomized time duration. And it waits until the most up-to-date
  480. // node (current or new leader) applies the snapshot count of entries since
  481. // the delay operation.
  482. // The expected behavior is that cluster may elect a new leader, and
  483. // the old leader gets isolated and behind the current active leader,
  484. // and once delay operation is undone, the slow follower comes back
  485. // and catches up, likely receiving a snapshot from the active leader.
  486. // As always, after recovery, each member must be able to process client
  487. // requests.
  488. RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;
  489. // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
  490. // the peer ports on majority nodes of cluster. And it waits for
  491. // "delay-ms" until recovery, likely to trigger election timeouts.
  492. // The expected behavior is that cluster may elect a new leader, while
  493. // quorum of nodes struggle with slow networks, and once delay operation
  494. // is undone, nodes come back and cluster comes back operative. As always,
  495. // after recovery, each member must be able to process client requests.
  496. DELAY_PEER_PORT_TX_RX_QUORUM = 208;
  497. // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  498. // from/to the peer ports on majority nodes of cluster, with randomized
  499. // time durations. And it waits for "delay-ms" until recovery, likely
  500. // to trigger election timeouts.
  501. // The expected behavior is that cluster may elect a new leader, while
  502. // quorum of nodes struggle with slow networks, and once delay operation
  503. // is undone, nodes come back and cluster comes back operative. As always,
  504. // after recovery, each member must be able to process client requests.
  505. RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
  506. // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
  507. // peer ports on all nodes. And it waits for "delay-ms" until recovery,
  508. // likely to trigger election timeouts.
  509. // The expected behavior is that cluster may become totally inoperable,
  510. // struggling with slow networks across the whole cluster. Once delay
  511. // operation is undone, nodes come back and cluster comes back operative.
  512. // As always, after recovery, each member must be able to process client
  513. // requests.
  514. DELAY_PEER_PORT_TX_RX_ALL = 210;
  515. // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
  516. // from/to the peer ports on all nodes, with randomized time durations.
  517. // And it waits for "delay-ms" until recovery, likely to trigger
  518. // election timeouts.
  519. // The expected behavior is that cluster may become totally inoperable,
  520. // struggling with slow networks across the whole cluster. Once delay
  521. // operation is undone, nodes come back and cluster comes back operative.
  522. // As always, after recovery, each member must be able to process client
  523. // requests.
  524. RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;
  525. // NO_FAIL_WITH_STRESS stops injecting failures while testing the
  526. // consistency and correctness under pressure loads, for the duration of
  527. // "delay-ms". The goal is to ensure the cluster is still making progress
  528. // on recovery, and verify system does not deadlock following a sequence
  529. // of failure injections.
  530. // The expected behavior is that cluster remains fully operative in healthy
  531. // condition. As always, after recovery, each member must be able to process
  532. // client requests.
  533. NO_FAIL_WITH_STRESS = 300;
  534. // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS neither injects failures nor
  535. // sends stressing client requests to the cluster, for the duration of
  536. // "delay-ms". The goal is to ensure the cluster is still making progress
  537. // on recovery, and verify system does not deadlock following a sequence
  538. // of failure injections.
  539. // The expected behavior is that cluster remains fully operative in healthy
  540. // condition, and client requests during the liveness period succeed without
  541. // errors.
  542. // Note: this is how Google Chubby does failure injection testing
  543. // https://static.googleusercontent.com/media/research.google.com/en//archive/paxos_made_live.pdf.
  544. NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;
  545. // FAILPOINTS injects failpoints to etcd server runtime, triggering panics
  546. // in critical code paths.
  547. FAILPOINTS = 400;
  548. // EXTERNAL runs external failure injection scripts.
  549. EXTERNAL = 500;
  550. }