cluster_test.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package integration
  15. import (
  16. "fmt"
  17. "log"
  18. "math/rand"
  19. "os"
  20. "strconv"
  21. "strings"
  22. "testing"
  23. "time"
  24. "github.com/coreos/etcd/client"
  25. "github.com/coreos/etcd/etcdserver"
  26. "github.com/coreos/etcd/pkg/testutil"
  27. "github.com/coreos/pkg/capnslog"
  28. "golang.org/x/net/context"
  29. )
  30. func init() {
  31. // open microsecond-level time log for integration test debugging
  32. log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile)
  33. if t := os.Getenv("ETCD_ELECTION_TIMEOUT_TICKS"); t != "" {
  34. if i, err := strconv.ParseInt(t, 10, 64); err == nil {
  35. electionTicks = int(i)
  36. }
  37. }
  38. }
  39. func TestClusterOf1(t *testing.T) { testCluster(t, 1) }
  40. func TestClusterOf3(t *testing.T) { testCluster(t, 3) }
  41. func testCluster(t *testing.T, size int) {
  42. defer testutil.AfterTest(t)
  43. c := NewCluster(t, size)
  44. c.Launch(t)
  45. defer c.Terminate(t)
  46. clusterMustProgress(t, c.Members)
  47. }
  48. func TestTLSClusterOf3(t *testing.T) {
  49. defer testutil.AfterTest(t)
  50. c := NewClusterByConfig(t, &ClusterConfig{Size: 3, PeerTLS: &testTLSInfo})
  51. c.Launch(t)
  52. defer c.Terminate(t)
  53. clusterMustProgress(t, c.Members)
  54. }
  55. func TestClusterOf1UsingDiscovery(t *testing.T) { testClusterUsingDiscovery(t, 1) }
  56. func TestClusterOf3UsingDiscovery(t *testing.T) { testClusterUsingDiscovery(t, 3) }
  57. func testClusterUsingDiscovery(t *testing.T, size int) {
  58. defer testutil.AfterTest(t)
  59. dc := NewCluster(t, 1)
  60. dc.Launch(t)
  61. defer dc.Terminate(t)
  62. // init discovery token space
  63. dcc := MustNewHTTPClient(t, dc.URLs(), nil)
  64. dkapi := client.NewKeysAPI(dcc)
  65. ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
  66. if _, err := dkapi.Create(ctx, "/_config/size", fmt.Sprintf("%d", size)); err != nil {
  67. t.Fatal(err)
  68. }
  69. cancel()
  70. c := NewClusterByConfig(
  71. t,
  72. &ClusterConfig{Size: size, DiscoveryURL: dc.URL(0) + "/v2/keys"},
  73. )
  74. c.Launch(t)
  75. defer c.Terminate(t)
  76. clusterMustProgress(t, c.Members)
  77. }
  78. func TestTLSClusterOf3UsingDiscovery(t *testing.T) {
  79. defer testutil.AfterTest(t)
  80. dc := NewCluster(t, 1)
  81. dc.Launch(t)
  82. defer dc.Terminate(t)
  83. // init discovery token space
  84. dcc := MustNewHTTPClient(t, dc.URLs(), nil)
  85. dkapi := client.NewKeysAPI(dcc)
  86. ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
  87. if _, err := dkapi.Create(ctx, "/_config/size", fmt.Sprintf("%d", 3)); err != nil {
  88. t.Fatal(err)
  89. }
  90. cancel()
  91. c := NewClusterByConfig(t,
  92. &ClusterConfig{
  93. Size: 3,
  94. PeerTLS: &testTLSInfo,
  95. DiscoveryURL: dc.URL(0) + "/v2/keys"},
  96. )
  97. c.Launch(t)
  98. defer c.Terminate(t)
  99. clusterMustProgress(t, c.Members)
  100. }
  101. func TestDoubleClusterSizeOf1(t *testing.T) { testDoubleClusterSize(t, 1) }
  102. func TestDoubleClusterSizeOf3(t *testing.T) { testDoubleClusterSize(t, 3) }
  103. func testDoubleClusterSize(t *testing.T, size int) {
  104. defer testutil.AfterTest(t)
  105. c := NewCluster(t, size)
  106. c.Launch(t)
  107. defer c.Terminate(t)
  108. for i := 0; i < size; i++ {
  109. c.AddMember(t)
  110. }
  111. clusterMustProgress(t, c.Members)
  112. }
  113. func TestDoubleTLSClusterSizeOf3(t *testing.T) {
  114. defer testutil.AfterTest(t)
  115. c := NewClusterByConfig(t, &ClusterConfig{Size: 3, PeerTLS: &testTLSInfo})
  116. c.Launch(t)
  117. defer c.Terminate(t)
  118. for i := 0; i < 3; i++ {
  119. c.AddMember(t)
  120. }
  121. clusterMustProgress(t, c.Members)
  122. }
  123. func TestDecreaseClusterSizeOf3(t *testing.T) { testDecreaseClusterSize(t, 3) }
  124. func TestDecreaseClusterSizeOf5(t *testing.T) { testDecreaseClusterSize(t, 5) }
  125. func testDecreaseClusterSize(t *testing.T, size int) {
  126. defer testutil.AfterTest(t)
  127. c := NewCluster(t, size)
  128. c.Launch(t)
  129. defer c.Terminate(t)
  130. // TODO: remove the last but one member
  131. for i := 0; i < size-1; i++ {
  132. id := c.Members[len(c.Members)-1].s.ID()
  133. c.RemoveMember(t, uint64(id))
  134. c.waitLeader(t, c.Members)
  135. }
  136. clusterMustProgress(t, c.Members)
  137. }
  138. func TestForceNewCluster(t *testing.T) {
  139. c := NewCluster(t, 3)
  140. c.Launch(t)
  141. cc := MustNewHTTPClient(t, []string{c.Members[0].URL()}, nil)
  142. kapi := client.NewKeysAPI(cc)
  143. ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
  144. resp, err := kapi.Create(ctx, "/foo", "bar")
  145. if err != nil {
  146. t.Fatalf("unexpected create error: %v", err)
  147. }
  148. cancel()
  149. // ensure create has been applied in this machine
  150. ctx, cancel = context.WithTimeout(context.Background(), requestTimeout)
  151. if _, err = kapi.Watcher("/foo", &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(ctx); err != nil {
  152. t.Fatalf("unexpected watch error: %v", err)
  153. }
  154. cancel()
  155. c.Members[0].Stop(t)
  156. c.Members[1].Terminate(t)
  157. c.Members[2].Terminate(t)
  158. c.Members[0].ForceNewCluster = true
  159. err = c.Members[0].Restart(t)
  160. if err != nil {
  161. t.Fatalf("unexpected ForceRestart error: %v", err)
  162. }
  163. defer c.Members[0].Terminate(t)
  164. c.waitLeader(t, c.Members[:1])
  165. // use new http client to init new connection
  166. cc = MustNewHTTPClient(t, []string{c.Members[0].URL()}, nil)
  167. kapi = client.NewKeysAPI(cc)
  168. // ensure force restart keep the old data, and new cluster can make progress
  169. ctx, cancel = context.WithTimeout(context.Background(), requestTimeout)
  170. if _, err := kapi.Watcher("/foo", &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(ctx); err != nil {
  171. t.Fatalf("unexpected watch error: %v", err)
  172. }
  173. cancel()
  174. clusterMustProgress(t, c.Members[:1])
  175. }
  176. func TestAddMemberAfterClusterFullRotation(t *testing.T) {
  177. defer testutil.AfterTest(t)
  178. c := NewCluster(t, 3)
  179. c.Launch(t)
  180. defer c.Terminate(t)
  181. // remove all the previous three members and add in three new members.
  182. for i := 0; i < 3; i++ {
  183. c.RemoveMember(t, uint64(c.Members[0].s.ID()))
  184. c.waitLeader(t, c.Members)
  185. c.AddMember(t)
  186. c.waitLeader(t, c.Members)
  187. }
  188. c.AddMember(t)
  189. c.waitLeader(t, c.Members)
  190. clusterMustProgress(t, c.Members)
  191. }
  192. // Ensure we can remove a member then add a new one back immediately.
  193. func TestIssue2681(t *testing.T) {
  194. defer testutil.AfterTest(t)
  195. c := NewCluster(t, 5)
  196. c.Launch(t)
  197. defer c.Terminate(t)
  198. c.RemoveMember(t, uint64(c.Members[4].s.ID()))
  199. c.waitLeader(t, c.Members)
  200. c.AddMember(t)
  201. c.waitLeader(t, c.Members)
  202. clusterMustProgress(t, c.Members)
  203. }
  204. // Ensure we can remove a member after a snapshot then add a new one back.
  205. func TestIssue2746(t *testing.T) { testIssue2746(t, 5) }
  206. // With 3 nodes TestIssue2476 sometimes had a shutdown with an inflight snapshot.
  207. func TestIssue2746WithThree(t *testing.T) { testIssue2746(t, 3) }
  208. func testIssue2746(t *testing.T, members int) {
  209. defer testutil.AfterTest(t)
  210. c := NewCluster(t, members)
  211. for _, m := range c.Members {
  212. m.SnapCount = 10
  213. }
  214. c.Launch(t)
  215. defer c.Terminate(t)
  216. // force a snapshot
  217. for i := 0; i < 20; i++ {
  218. clusterMustProgress(t, c.Members)
  219. }
  220. c.RemoveMember(t, uint64(c.Members[members-1].s.ID()))
  221. c.waitLeader(t, c.Members)
  222. c.AddMember(t)
  223. c.waitLeader(t, c.Members)
  224. clusterMustProgress(t, c.Members)
  225. }
  226. // Ensure etcd will not panic when removing a just started member.
  227. func TestIssue2904(t *testing.T) {
  228. defer testutil.AfterTest(t)
  229. // start 1-member cluster to ensure member 0 is the leader of the cluster.
  230. c := NewCluster(t, 1)
  231. c.Launch(t)
  232. defer c.Terminate(t)
  233. c.AddMember(t)
  234. c.Members[1].Stop(t)
  235. // send remove member-1 request to the cluster.
  236. cc := MustNewHTTPClient(t, c.URLs(), nil)
  237. ma := client.NewMembersAPI(cc)
  238. ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
  239. // the proposal is not committed because member 1 is stopped, but the
  240. // proposal is appended to leader's raft log.
  241. ma.Remove(ctx, c.Members[1].s.ID().String())
  242. cancel()
  243. // restart member, and expect it to send UpdateAttributes request.
  244. // the log in the leader is like this:
  245. // [..., remove 1, ..., update attr 1, ...]
  246. c.Members[1].Restart(t)
  247. // when the member comes back, it ack the proposal to remove itself,
  248. // and apply it.
  249. <-c.Members[1].s.StopNotify()
  250. // terminate removed member
  251. c.Members[1].Terminate(t)
  252. c.Members = c.Members[:1]
  253. // wait member to be removed.
  254. c.waitMembersMatch(t, c.HTTPMembers())
  255. }
  256. // TestIssue3699 tests minority failure during cluster configuration; it was
  257. // deadlocking.
  258. func TestIssue3699(t *testing.T) {
  259. // start a cluster of 3 nodes a, b, c
  260. defer testutil.AfterTest(t)
  261. c := NewCluster(t, 3)
  262. c.Launch(t)
  263. defer c.Terminate(t)
  264. // make node a unavailable
  265. c.Members[0].Stop(t)
  266. // add node d
  267. c.AddMember(t)
  268. // electing node d as leader makes node a unable to participate
  269. leaderID := c.waitLeader(t, c.Members)
  270. for leaderID != 3 {
  271. c.Members[leaderID].Stop(t)
  272. <-c.Members[leaderID].s.StopNotify()
  273. // do not restart the killed member immediately.
  274. // the member will advance its election timeout after restart,
  275. // so it will have a better chance to become the leader again.
  276. time.Sleep(time.Duration(electionTicks * int(tickDuration)))
  277. c.Members[leaderID].Restart(t)
  278. leaderID = c.waitLeader(t, c.Members)
  279. }
  280. // bring back node a
  281. // node a will remain useless as long as d is the leader.
  282. if err := c.Members[0].Restart(t); err != nil {
  283. t.Fatal(err)
  284. }
  285. select {
  286. // waiting for ReadyNotify can take several seconds
  287. case <-time.After(10 * time.Second):
  288. t.Fatalf("waited too long for ready notification")
  289. case <-c.Members[0].s.StopNotify():
  290. t.Fatalf("should not be stopped")
  291. case <-c.Members[0].s.ReadyNotify():
  292. }
  293. // must waitLeader so goroutines don't leak on terminate
  294. c.waitLeader(t, c.Members)
  295. // try to participate in cluster
  296. cc := MustNewHTTPClient(t, []string{c.URL(0)}, c.cfg.ClientTLS)
  297. kapi := client.NewKeysAPI(cc)
  298. ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
  299. if _, err := kapi.Set(ctx, "/foo", "bar", nil); err != nil {
  300. t.Fatalf("unexpected error on Set (%v)", err)
  301. }
  302. cancel()
  303. }
  304. // TestRejectUnhealthyAdd ensures an unhealthy cluster rejects adding members.
  305. func TestRejectUnhealthyAdd(t *testing.T) {
  306. defer testutil.AfterTest(t)
  307. c := NewCluster(t, 3)
  308. for _, m := range c.Members {
  309. m.ServerConfig.StrictReconfigCheck = true
  310. }
  311. c.Launch(t)
  312. defer c.Terminate(t)
  313. // make cluster unhealthy and wait for downed peer
  314. c.Members[0].Stop(t)
  315. c.WaitLeader(t)
  316. // all attempts to add member should fail
  317. for i := 1; i < len(c.Members); i++ {
  318. err := c.addMemberByURL(t, c.URL(i), "unix://foo:12345")
  319. if err == nil {
  320. t.Fatalf("should have failed adding peer")
  321. }
  322. // TODO: client should return descriptive error codes for internal errors
  323. if !strings.Contains(err.Error(), "has no leader") {
  324. t.Errorf("unexpected error (%v)", err)
  325. }
  326. }
  327. // make cluster healthy
  328. c.Members[0].Restart(t)
  329. c.WaitLeader(t)
  330. time.Sleep(2 * etcdserver.HealthInterval)
  331. // add member should succeed now that it's healthy
  332. var err error
  333. for i := 1; i < len(c.Members); i++ {
  334. if err = c.addMemberByURL(t, c.URL(i), "unix://foo:12345"); err == nil {
  335. break
  336. }
  337. }
  338. if err != nil {
  339. t.Fatalf("should have added peer to healthy cluster (%v)", err)
  340. }
  341. }
  342. // TestRejectUnhealthyRemove ensures an unhealthy cluster rejects removing members
  343. // if quorum will be lost.
  344. func TestRejectUnhealthyRemove(t *testing.T) {
  345. defer testutil.AfterTest(t)
  346. c := NewCluster(t, 5)
  347. for _, m := range c.Members {
  348. m.ServerConfig.StrictReconfigCheck = true
  349. }
  350. c.Launch(t)
  351. defer c.Terminate(t)
  352. // make cluster unhealthy and wait for downed peer; (3 up, 2 down)
  353. c.Members[0].Stop(t)
  354. c.Members[1].Stop(t)
  355. c.WaitLeader(t)
  356. // reject remove active member since (3,2)-(1,0) => (2,2) lacks quorum
  357. err := c.removeMember(t, uint64(c.Members[2].s.ID()))
  358. if err == nil {
  359. t.Fatalf("should reject quorum breaking remove")
  360. }
  361. // TODO: client should return more descriptive error codes for internal errors
  362. if !strings.Contains(err.Error(), "has no leader") {
  363. t.Errorf("unexpected error (%v)", err)
  364. }
  365. // member stopped after launch; wait for missing heartbeats
  366. time.Sleep(time.Duration(electionTicks * int(tickDuration)))
  367. // permit remove dead member since (3,2) - (0,1) => (3,1) has quorum
  368. if err = c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
  369. t.Fatalf("should accept removing down member")
  370. }
  371. // bring cluster to (4,1)
  372. c.Members[0].Restart(t)
  373. // restarted member must be connected for a HealthInterval before remove is accepted
  374. time.Sleep((3 * etcdserver.HealthInterval) / 2)
  375. // accept remove member since (4,1)-(1,0) => (3,1) has quorum
  376. if err = c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
  377. t.Fatalf("expected to remove member, got error %v", err)
  378. }
  379. }
  380. // TestRestartRemoved ensures that restarting removed member must exit
  381. // if 'initial-cluster-state' is set 'new' and old data directory still exists
  382. // (see https://github.com/coreos/etcd/issues/7512 for more).
  383. func TestRestartRemoved(t *testing.T) {
  384. defer testutil.AfterTest(t)
  385. capnslog.SetGlobalLogLevel(capnslog.INFO)
  386. defer capnslog.SetGlobalLogLevel(defaultLogLevel)
  387. // 1. start single-member cluster
  388. c := NewCluster(t, 1)
  389. for _, m := range c.Members {
  390. m.ServerConfig.StrictReconfigCheck = true
  391. }
  392. c.Launch(t)
  393. defer c.Terminate(t)
  394. // 2. add a new member
  395. c.AddMember(t)
  396. c.WaitLeader(t)
  397. oldm := c.Members[0]
  398. oldm.keepDataDirTerminate = true
  399. // 3. remove first member, shut down without deleting data
  400. if err := c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
  401. t.Fatalf("expected to remove member, got error %v", err)
  402. }
  403. c.WaitLeader(t)
  404. // 4. restart first member with 'initial-cluster-state=new'
  405. // wrong config, expects exit within ReqTimeout
  406. oldm.ServerConfig.NewCluster = false
  407. if err := oldm.Restart(t); err != nil {
  408. t.Fatalf("unexpected ForceRestart error: %v", err)
  409. }
  410. defer func() {
  411. oldm.Close()
  412. os.RemoveAll(oldm.ServerConfig.DataDir)
  413. }()
  414. select {
  415. case <-oldm.s.StopNotify():
  416. case <-time.After(time.Minute):
  417. t.Fatalf("removed member didn't exit within %v", time.Minute)
  418. }
  419. }
  420. // clusterMustProgress ensures that cluster can make progress. It creates
  421. // a random key first, and check the new key could be got from all client urls
  422. // of the cluster.
  423. func clusterMustProgress(t *testing.T, membs []*member) {
  424. cc := MustNewHTTPClient(t, []string{membs[0].URL()}, nil)
  425. kapi := client.NewKeysAPI(cc)
  426. key := fmt.Sprintf("foo%d", rand.Int())
  427. var (
  428. err error
  429. resp *client.Response
  430. )
  431. // retry in case of leader loss induced by slow CI
  432. for i := 0; i < 3; i++ {
  433. ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
  434. resp, err = kapi.Create(ctx, "/"+key, "bar")
  435. cancel()
  436. if err == nil {
  437. break
  438. }
  439. t.Logf("failed to create key on %q (%v)", membs[0].URL(), err)
  440. }
  441. if err != nil {
  442. t.Fatalf("create on %s error: %v", membs[0].URL(), err)
  443. }
  444. for i, m := range membs {
  445. u := m.URL()
  446. mcc := MustNewHTTPClient(t, []string{u}, nil)
  447. mkapi := client.NewKeysAPI(mcc)
  448. mctx, mcancel := context.WithTimeout(context.Background(), requestTimeout)
  449. if _, err := mkapi.Watcher(key, &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(mctx); err != nil {
  450. t.Fatalf("#%d: watch on %s error: %v", i, u, err)
  451. }
  452. mcancel()
  453. }
  454. }
  455. func TestTransferLeader(t *testing.T) {
  456. defer testutil.AfterTest(t)
  457. clus := NewClusterV3(t, &ClusterConfig{Size: 3})
  458. defer clus.Terminate(t)
  459. oldLeadIdx := clus.WaitLeader(t)
  460. oldLeadID := uint64(clus.Members[oldLeadIdx].s.ID())
  461. // ensure followers go through leader transition while learship transfer
  462. idc := make(chan uint64)
  463. for i := range clus.Members {
  464. if oldLeadIdx != i {
  465. go func(m *member) {
  466. idc <- checkLeaderTransition(t, m, oldLeadID)
  467. }(clus.Members[i])
  468. }
  469. }
  470. err := clus.Members[oldLeadIdx].s.TransferLeadership()
  471. if err != nil {
  472. t.Fatal(err)
  473. }
  474. // wait until leader transitions have happened
  475. var newLeadIDs [2]uint64
  476. for i := range newLeadIDs {
  477. select {
  478. case newLeadIDs[i] = <-idc:
  479. case <-time.After(time.Second):
  480. t.Fatal("timed out waiting for leader transition")
  481. }
  482. }
  483. // remaining members must agree on the same leader
  484. if newLeadIDs[0] != newLeadIDs[1] {
  485. t.Fatalf("expected same new leader %d == %d", newLeadIDs[0], newLeadIDs[1])
  486. }
  487. // new leader must be different than the old leader
  488. if oldLeadID == newLeadIDs[0] {
  489. t.Fatalf("expected old leader %d != new leader %d", oldLeadID, newLeadIDs[0])
  490. }
  491. }
  492. func TestSpeedyTerminate(t *testing.T) {
  493. defer testutil.AfterTest(t)
  494. clus := NewClusterV3(t, &ClusterConfig{Size: 3})
  495. // Stop/Restart so requests will time out on lost leaders
  496. for i := 0; i < 3; i++ {
  497. clus.Members[i].Stop(t)
  498. clus.Members[i].Restart(t)
  499. }
  500. donec := make(chan struct{})
  501. go func() {
  502. defer close(donec)
  503. clus.Terminate(t)
  504. }()
  505. select {
  506. case <-time.After(10 * time.Second):
  507. t.Fatalf("cluster took too long to terminate")
  508. case <-donec:
  509. }
  510. }