cluster_test.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package integration

import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"os"
	"strconv"
	"strings"
	"testing"
	"time"

	"go.etcd.io/etcd/client"
	"go.etcd.io/etcd/etcdserver"
	"go.etcd.io/etcd/pkg/testutil"
)

func init() {
	// open microsecond-level time log for integration test debugging
	log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile)
	// allow overriding the election timeout through the environment
	if t := os.Getenv("ETCD_ELECTION_TIMEOUT_TICKS"); t != "" {
		if i, err := strconv.ParseInt(t, 10, 64); err == nil {
			electionTicks = int(i)
		}
	}
}

func TestClusterOf1(t *testing.T) { testCluster(t, 1) }
func TestClusterOf3(t *testing.T) { testCluster(t, 3) }
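
// testCluster launches a cluster of the given size and verifies that it
// can make progress.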
func testCluster(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, size)
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestTLSClusterOf3(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewClusterByConfig(t, &ClusterConfig{Size: 3, PeerTLS: &testTLSInfo})
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestClusterOf1UsingDiscovery(t *testing.T) { testClusterUsingDiscovery(t, 1) }
func TestClusterOf3UsingDiscovery(t *testing.T) { testClusterUsingDiscovery(t, 3) }
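
// testClusterUsingDiscovery launches a single-member cluster to act as the
// discovery service, seeds its token space, and then bootstraps a cluster of
// the given size through it.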
func testClusterUsingDiscovery(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	dc := NewCluster(t, 1)
	dc.Launch(t)
	defer dc.Terminate(t)
	// init discovery token space
	dcc := MustNewHTTPClient(t, dc.URLs(), nil)
	dkapi := client.NewKeysAPI(dcc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	if _, err := dkapi.Create(ctx, "/_config/size", fmt.Sprintf("%d", size)); err != nil {
		t.Fatal(err)
	}
	cancel()

	c := NewClusterByConfig(
		t,
		&ClusterConfig{Size: size, DiscoveryURL: dc.URL(0) + "/v2/keys"},
	)
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestTLSClusterOf3UsingDiscovery(t *testing.T) {
	defer testutil.AfterTest(t)
	dc := NewCluster(t, 1)
	dc.Launch(t)
	defer dc.Terminate(t)
	// init discovery token space
	dcc := MustNewHTTPClient(t, dc.URLs(), nil)
	dkapi := client.NewKeysAPI(dcc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	if _, err := dkapi.Create(ctx, "/_config/size", fmt.Sprintf("%d", 3)); err != nil {
		t.Fatal(err)
	}
	cancel()

	c := NewClusterByConfig(t,
		&ClusterConfig{
			Size:         3,
			PeerTLS:      &testTLSInfo,
			DiscoveryURL: dc.URL(0) + "/v2/keys",
		},
	)
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestDoubleClusterSizeOf1(t *testing.T) { testDoubleClusterSize(t, 1) }
func TestDoubleClusterSizeOf3(t *testing.T) { testDoubleClusterSize(t, 3) }
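
// testDoubleClusterSize launches a cluster of the given size and then doubles
// it by adding one member at a time.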
func testDoubleClusterSize(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, size)
	c.Launch(t)
	defer c.Terminate(t)

	for i := 0; i < size; i++ {
		c.AddMember(t)
	}
	clusterMustProgress(t, c.Members)
}

func TestDoubleTLSClusterSizeOf3(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewClusterByConfig(t, &ClusterConfig{Size: 3, PeerTLS: &testTLSInfo})
	c.Launch(t)
	defer c.Terminate(t)

	for i := 0; i < 3; i++ {
		c.AddMember(t)
	}
	clusterMustProgress(t, c.Members)
}

func TestDecreaseClusterSizeOf3(t *testing.T) { testDecreaseClusterSize(t, 3) }
func TestDecreaseClusterSizeOf5(t *testing.T) { testDecreaseClusterSize(t, 5) }
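
// testDecreaseClusterSize launches a cluster of the given size and then
// shrinks it down to a single member by removing the last member repeatedly.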
func testDecreaseClusterSize(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, size)
	c.Launch(t)
	defer c.Terminate(t)

	// TODO: remove the last but one member
	for i := 0; i < size-1; i++ {
		id := c.Members[len(c.Members)-1].s.ID()

		// may hit second leader election on slow machines
		if err := c.removeMember(t, uint64(id)); err != nil {
			if strings.Contains(err.Error(), "no leader") {
				t.Logf("got leader error (%v)", err)
				i--
				continue
			}
			t.Fatal(err)
		}
		c.waitLeader(t, c.Members)
	}
	clusterMustProgress(t, c.Members)
}
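
// TestForceNewCluster verifies that restarting a surviving member with
// ForceNewCluster set preserves the previously written data, and that the
// resulting single-member cluster can make progress.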
func TestForceNewCluster(t *testing.T) {
	c := NewCluster(t, 3)
	c.Launch(t)
	cc := MustNewHTTPClient(t, []string{c.Members[0].URL()}, nil)
	kapi := client.NewKeysAPI(cc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	resp, err := kapi.Create(ctx, "/foo", "bar")
	if err != nil {
		t.Fatalf("unexpected create error: %v", err)
	}
	cancel()
	// ensure create has been applied in this machine
	ctx, cancel = context.WithTimeout(context.Background(), requestTimeout)
	if _, err = kapi.Watcher("/foo", &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(ctx); err != nil {
		t.Fatalf("unexpected watch error: %v", err)
	}
	cancel()

	c.Members[0].Stop(t)
	c.Members[1].Terminate(t)
	c.Members[2].Terminate(t)

	c.Members[0].ForceNewCluster = true
	err = c.Members[0].Restart(t)
	if err != nil {
		t.Fatalf("unexpected ForceRestart error: %v", err)
	}
	defer c.Members[0].Terminate(t)
	c.waitLeader(t, c.Members[:1])

	// use new http client to init new connection
	cc = MustNewHTTPClient(t, []string{c.Members[0].URL()}, nil)
	kapi = client.NewKeysAPI(cc)
	// ensure force restart keeps the old data, and the new cluster can make progress
	ctx, cancel = context.WithTimeout(context.Background(), requestTimeout)
	if _, err := kapi.Watcher("/foo", &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(ctx); err != nil {
		t.Fatalf("unexpected watch error: %v", err)
	}
	cancel()
	clusterMustProgress(t, c.Members[:1])
}
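
// TestAddMemberAfterClusterFullRotation replaces all the original members one
// by one, then adds one more member, and verifies the cluster still makes
// progress.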
func TestAddMemberAfterClusterFullRotation(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 3)
	c.Launch(t)
	defer c.Terminate(t)

	// remove all the previous three members and add in three new members.
	for i := 0; i < 3; i++ {
		c.RemoveMember(t, uint64(c.Members[0].s.ID()))
		c.waitLeader(t, c.Members)

		c.AddMember(t)
		c.waitLeader(t, c.Members)
	}

	c.AddMember(t)
	c.waitLeader(t, c.Members)

	clusterMustProgress(t, c.Members)
}

// Ensure we can remove a member then add a new one back immediately.
func TestIssue2681(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 5)
	c.Launch(t)
	defer c.Terminate(t)

	c.RemoveMember(t, uint64(c.Members[4].s.ID()))
	c.waitLeader(t, c.Members)

	c.AddMember(t)
	c.waitLeader(t, c.Members)
	clusterMustProgress(t, c.Members)
}

// Ensure we can remove a member after a snapshot then add a new one back.
func TestIssue2746(t *testing.T) { testIssue2746(t, 5) }

// With 3 nodes TestIssue2746 sometimes had a shutdown with an inflight snapshot.
func TestIssue2746WithThree(t *testing.T) { testIssue2746(t, 3) }

func testIssue2746(t *testing.T, members int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, members)

	for _, m := range c.Members {
		m.SnapshotCount = 10
	}
	c.Launch(t)
	defer c.Terminate(t)

	// force a snapshot
	for i := 0; i < 20; i++ {
		clusterMustProgress(t, c.Members)
	}

	c.RemoveMember(t, uint64(c.Members[members-1].s.ID()))
	c.waitLeader(t, c.Members)

	c.AddMember(t)
	c.waitLeader(t, c.Members)
	clusterMustProgress(t, c.Members)
}

// Ensure etcd will not panic when removing a just started member.
func TestIssue2904(t *testing.T) {
	defer testutil.AfterTest(t)
	// start 1-member cluster to ensure member 0 is the leader of the cluster.
	c := NewCluster(t, 1)
	c.Launch(t)
	defer c.Terminate(t)

	c.AddMember(t)
	c.Members[1].Stop(t)

	// send remove member-1 request to the cluster.
	cc := MustNewHTTPClient(t, c.URLs(), nil)
	ma := client.NewMembersAPI(cc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	// the proposal is not committed because member 1 is stopped, but the
	// proposal is appended to leader's raft log.
	ma.Remove(ctx, c.Members[1].s.ID().String())
	cancel()

	// restart member, and expect it to send UpdateAttributes request.
	// the log in the leader is like this:
	// [..., remove 1, ..., update attr 1, ...]
	c.Members[1].Restart(t)
	// when the member comes back, it acks the proposal to remove itself,
	// and applies it.
	<-c.Members[1].s.StopNotify()

	// terminate removed member
	c.Members[1].Terminate(t)
	c.Members = c.Members[:1]
	// wait for the member to be removed.
	c.waitMembersMatch(t, c.HTTPMembers())
}

// TestIssue3699 tests minority failure during cluster configuration; it was
// deadlocking.
func TestIssue3699(t *testing.T) {
	// start a cluster of 3 nodes a, b, c
	defer testutil.AfterTest(t)
	c := NewCluster(t, 3)
	c.Launch(t)
	defer c.Terminate(t)

	// make node a unavailable
	c.Members[0].Stop(t)

	// add node d
	c.AddMember(t)

	// electing node d as leader makes node a unable to participate
	leaderID := c.waitLeader(t, c.Members)
	for leaderID != 3 {
		c.Members[leaderID].Stop(t)
		<-c.Members[leaderID].s.StopNotify()
		// do not restart the killed member immediately.
		// the member will advance its election timeout after restart,
		// so it will have a better chance to become the leader again.
		time.Sleep(time.Duration(electionTicks * int(tickDuration)))
		c.Members[leaderID].Restart(t)
		leaderID = c.waitLeader(t, c.Members)
	}

	// bring back node a
	// node a will remain useless as long as d is the leader.
	if err := c.Members[0].Restart(t); err != nil {
		t.Fatal(err)
	}
	select {
	// waiting for ReadyNotify can take several seconds
	case <-time.After(10 * time.Second):
		t.Fatalf("waited too long for ready notification")
	case <-c.Members[0].s.StopNotify():
		t.Fatalf("should not be stopped")
	case <-c.Members[0].s.ReadyNotify():
	}
	// must waitLeader so goroutines don't leak on terminate
	c.waitLeader(t, c.Members)

	// try to participate in cluster
	cc := MustNewHTTPClient(t, []string{c.URL(0)}, c.cfg.ClientTLS)
	kapi := client.NewKeysAPI(cc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	if _, err := kapi.Set(ctx, "/foo", "bar", nil); err != nil {
		t.Fatalf("unexpected error on Set (%v)", err)
	}
	cancel()
}

// TestRejectUnhealthyAdd ensures an unhealthy cluster rejects adding members.
func TestRejectUnhealthyAdd(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 3)
	for _, m := range c.Members {
		m.ServerConfig.StrictReconfigCheck = true
	}
	c.Launch(t)
	defer c.Terminate(t)

	// make cluster unhealthy and wait for downed peer
	c.Members[0].Stop(t)
	c.WaitLeader(t)

	// all attempts to add member should fail
	for i := 1; i < len(c.Members); i++ {
		err := c.addMemberByURL(t, c.URL(i), "unix://foo:12345")
		if err == nil {
			t.Fatalf("should have failed adding peer")
		}
		// TODO: client should return descriptive error codes for internal errors
		if !strings.Contains(err.Error(), "has no leader") {
			t.Errorf("unexpected error (%v)", err)
		}
	}

	// make cluster healthy
	c.Members[0].Restart(t)
	c.WaitLeader(t)
	time.Sleep(2 * etcdserver.HealthInterval)

	// add member should succeed now that it's healthy
	var err error
	for i := 1; i < len(c.Members); i++ {
		if err = c.addMemberByURL(t, c.URL(i), "unix://foo:12345"); err == nil {
			break
		}
	}
	if err != nil {
		t.Fatalf("should have added peer to healthy cluster (%v)", err)
	}
}

// TestRejectUnhealthyRemove ensures an unhealthy cluster rejects removing members
// if quorum will be lost.
func TestRejectUnhealthyRemove(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 5)
	for _, m := range c.Members {
		m.ServerConfig.StrictReconfigCheck = true
	}
	c.Launch(t)
	defer c.Terminate(t)

	// make cluster unhealthy and wait for downed peer; (3 up, 2 down)
	c.Members[0].Stop(t)
	c.Members[1].Stop(t)
	c.WaitLeader(t)

	// reject remove active member since (3,2)-(1,0) => (2,2) lacks quorum
	err := c.removeMember(t, uint64(c.Members[2].s.ID()))
	if err == nil {
		t.Fatalf("should reject quorum breaking remove")
	}
	// TODO: client should return more descriptive error codes for internal errors
	if !strings.Contains(err.Error(), "has no leader") {
		t.Errorf("unexpected error (%v)", err)
	}

	// member stopped after launch; wait for missing heartbeats
	time.Sleep(time.Duration(electionTicks * int(tickDuration)))

	// permit removing the dead member since (3,2) - (0,1) => (3,1) has quorum
	if err = c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
		t.Fatalf("should accept removing down member")
	}

	// bring cluster to (4,1)
	c.Members[0].Restart(t)

	// restarted member must be connected for a HealthInterval before remove is accepted
	time.Sleep((3 * etcdserver.HealthInterval) / 2)

	// accept remove member since (4,1)-(1,0) => (3,1) has quorum
	if err = c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
		t.Fatalf("expected to remove member, got error %v", err)
	}
}

// TestRestartRemoved ensures that a removed member must exit on restart
// if 'initial-cluster-state' is set to 'new' and the old data directory still exists
// (see https://github.com/etcd-io/etcd/issues/7512 for more).
func TestRestartRemoved(t *testing.T) {
	defer testutil.AfterTest(t)

	// 1. start single-member cluster
	c := NewCluster(t, 1)
	for _, m := range c.Members {
		m.ServerConfig.StrictReconfigCheck = true
	}
	c.Launch(t)
	defer c.Terminate(t)

	// 2. add a new member
	c.AddMember(t)
	c.WaitLeader(t)

	oldm := c.Members[0]
	oldm.keepDataDirTerminate = true

	// 3. remove first member, shut down without deleting data
	if err := c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
		t.Fatalf("expected to remove member, got error %v", err)
	}
	c.WaitLeader(t)

	// 4. restart first member with 'initial-cluster-state=new'
	// wrong config, expects exit within ReqTimeout
	oldm.ServerConfig.NewCluster = false
	if err := oldm.Restart(t); err != nil {
		t.Fatalf("unexpected ForceRestart error: %v", err)
	}
	defer func() {
		oldm.Close()
		os.RemoveAll(oldm.ServerConfig.DataDir)
	}()
	select {
	case <-oldm.s.StopNotify():
	case <-time.After(time.Minute):
		t.Fatalf("removed member didn't exit within %v", time.Minute)
	}
}

// clusterMustProgress ensures that the cluster can make progress. It creates
// a random key first, and then checks that the new key can be retrieved from
// all of the cluster's client URLs.
func clusterMustProgress(t *testing.T, membs []*member) {
	cc := MustNewHTTPClient(t, []string{membs[0].URL()}, nil)
	kapi := client.NewKeysAPI(cc)
	key := fmt.Sprintf("foo%d", rand.Int())
	var (
		err  error
		resp *client.Response
	)
	// retry in case of leader loss induced by slow CI
	for i := 0; i < 3; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
		resp, err = kapi.Create(ctx, "/"+key, "bar")
		cancel()
		if err == nil {
			break
		}
		t.Logf("failed to create key on %q (%v)", membs[0].URL(), err)
	}
	if err != nil {
		t.Fatalf("create on %s error: %v", membs[0].URL(), err)
	}

	for i, m := range membs {
		u := m.URL()
		mcc := MustNewHTTPClient(t, []string{u}, nil)
		mkapi := client.NewKeysAPI(mcc)
		mctx, mcancel := context.WithTimeout(context.Background(), requestTimeout)
		if _, err := mkapi.Watcher(key, &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(mctx); err != nil {
			t.Fatalf("#%d: watch on %s error: %v", i, u, err)
		}
		mcancel()
	}
}
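
// TestSpeedyTerminate checks that cluster termination does not hang even when
// every member has just been stopped and restarted, so in-flight requests may
// be timing out on lost leaders.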
func TestSpeedyTerminate(t *testing.T) {
	defer testutil.AfterTest(t)
	clus := NewClusterV3(t, &ClusterConfig{Size: 3})
	// Stop/Restart so requests will time out on lost leaders
	for i := 0; i < 3; i++ {
		clus.Members[i].Stop(t)
		clus.Members[i].Restart(t)
	}
	donec := make(chan struct{})
	go func() {
		defer close(donec)
		clus.Terminate(t)
	}()
	select {
	case <-time.After(10 * time.Second):
		t.Fatalf("cluster took too long to terminate")
	case <-donec:
	}
}