multi_node_kill_all_and_recovery_test.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. package test
  2. import (
  3. "bytes"
  4. "os"
  5. "strconv"
  6. "strings"
  7. "testing"
  8. "time"
  9. "github.com/coreos/etcd/server"
  10. "github.com/coreos/etcd/tests"
  11. "github.com/coreos/etcd/third_party/github.com/coreos/go-etcd/etcd"
  12. "github.com/coreos/etcd/third_party/github.com/stretchr/testify/assert"
  13. )
  14. // TestTLSMultiNodeKillAllAndRecovery create a five nodes
  15. // then kill all the nodes and restart
  16. func TestTLSMultiNodeKillAllAndRecovery(t *testing.T) {
  17. procAttr := new(os.ProcAttr)
  18. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  19. stop := make(chan bool)
  20. leaderChan := make(chan string, 1)
  21. all := make(chan bool, 1)
  22. clusterSize := 5
  23. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, true)
  24. defer DestroyCluster(etcds)
  25. if err != nil {
  26. t.Fatal("cannot create cluster")
  27. }
  28. time.Sleep(time.Second)
  29. c := etcd.NewClient(nil)
  30. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  31. <-all
  32. <-leaderChan
  33. stop <- true
  34. c.SyncCluster()
  35. // send 10 commands
  36. for i := 0; i < 10; i++ {
  37. // Test Set
  38. _, err := c.Set("foo", "bar", 0)
  39. if err != nil {
  40. panic(err)
  41. }
  42. }
  43. time.Sleep(time.Second)
  44. // kill all
  45. DestroyCluster(etcds)
  46. time.Sleep(time.Second)
  47. stop = make(chan bool)
  48. leaderChan = make(chan string, 1)
  49. all = make(chan bool, 1)
  50. time.Sleep(time.Second)
  51. for i := 0; i < clusterSize; i++ {
  52. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  53. // See util.go for the reason to wait for server
  54. client := buildClient()
  55. err = WaitForServer("127.0.0.1:400"+strconv.Itoa(i+1), client, "http")
  56. if err != nil {
  57. t.Fatalf("node start error: %s", err)
  58. }
  59. }
  60. go Monitor(clusterSize, 1, leaderChan, all, stop)
  61. <-all
  62. <-leaderChan
  63. result, err := c.Set("foo", "bar", 0)
  64. if err != nil {
  65. t.Fatalf("Recovery error: %s", err)
  66. }
  67. if result.Node.ModifiedIndex != 17 {
  68. t.Fatalf("recovery failed! [%d/17]", result.Node.ModifiedIndex)
  69. }
  70. }
  71. // Create a five-node cluster
  72. // Kill all the nodes and restart
  73. func TestMultiNodeKillAllAndRecoveryWithStandbys(t *testing.T) {
  74. procAttr := new(os.ProcAttr)
  75. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  76. stop := make(chan bool)
  77. leaderChan := make(chan string, 1)
  78. all := make(chan bool, 1)
  79. clusterSize := 15
  80. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  81. defer DestroyCluster(etcds)
  82. if err != nil {
  83. t.Fatal("cannot create cluster")
  84. }
  85. c := etcd.NewClient(nil)
  86. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  87. <-all
  88. <-leaderChan
  89. stop <- true
  90. c.SyncCluster()
  91. // Reconfigure with smaller active size (7 nodes) and wait for remove.
  92. resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7}`))
  93. if !assert.Equal(t, resp.StatusCode, 200) {
  94. t.FailNow()
  95. }
  96. time.Sleep(2*server.ActiveMonitorTimeout + (1 * time.Second))
  97. // Verify that there is three machines in peer mode.
  98. result, err := c.Get("_etcd/machines", false, true)
  99. assert.NoError(t, err)
  100. assert.Equal(t, len(result.Node.Nodes), 7)
  101. // send set commands
  102. for i := 0; i < 2*clusterSize; i++ {
  103. // Test Set
  104. _, err := c.Set("foo", "bar", 0)
  105. if err != nil {
  106. panic(err)
  107. }
  108. }
  109. time.Sleep(time.Second)
  110. // kill all
  111. DestroyCluster(etcds)
  112. time.Sleep(time.Second)
  113. stop = make(chan bool)
  114. leaderChan = make(chan string, 1)
  115. all = make(chan bool, 1)
  116. time.Sleep(time.Second)
  117. for i := 0; i < clusterSize; i++ {
  118. etcds[i], err = os.StartProcess(EtcdBinPath, append(argGroup[i], "-peers="), procAttr)
  119. }
  120. time.Sleep(2 * time.Second)
  121. // send set commands
  122. for i := 0; i < 2*clusterSize; i++ {
  123. // Test Set
  124. _, err := c.Set("foo", "bar", 0)
  125. if err != nil {
  126. t.Fatalf("Recovery error: %s", err)
  127. }
  128. }
  129. // Verify that we have seven machines.
  130. result, err = c.Get("_etcd/machines", false, true)
  131. assert.NoError(t, err)
  132. assert.Equal(t, len(result.Node.Nodes), 7)
  133. }
  134. // Create a five nodes
  135. // Kill all the nodes and restart, then remove the leader
  136. func TestMultiNodeKillAllAndRecoveryAndRemoveLeader(t *testing.T) {
  137. procAttr := new(os.ProcAttr)
  138. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  139. stop := make(chan bool)
  140. leaderChan := make(chan string, 1)
  141. all := make(chan bool, 1)
  142. clusterSize := 5
  143. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  144. defer DestroyCluster(etcds)
  145. if err != nil {
  146. t.Fatal("cannot create cluster")
  147. }
  148. c := etcd.NewClient(nil)
  149. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  150. <-all
  151. <-leaderChan
  152. stop <- true
  153. // It needs some time to sync current commits and write it to disk.
  154. // Or some instance may be restarted as a new peer, and we don't support
  155. // to connect back the old cluster that doesn't have majority alive
  156. // without log now.
  157. time.Sleep(time.Second)
  158. c.SyncCluster()
  159. // kill all
  160. DestroyCluster(etcds)
  161. time.Sleep(time.Second)
  162. stop = make(chan bool)
  163. leaderChan = make(chan string, 1)
  164. all = make(chan bool, 1)
  165. time.Sleep(time.Second)
  166. for i := 0; i < clusterSize; i++ {
  167. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  168. }
  169. go Monitor(clusterSize, 1, leaderChan, all, stop)
  170. <-all
  171. leader := <-leaderChan
  172. _, err = c.Set("foo", "bar", 0)
  173. if err != nil {
  174. t.Fatalf("Recovery error: %s", err)
  175. }
  176. port, _ := strconv.Atoi(strings.Split(leader, ":")[2])
  177. num := port - 7000
  178. resp, _ := tests.Delete(leader+"/v2/admin/machines/node"+strconv.Itoa(num), "application/json", nil)
  179. if !assert.Equal(t, resp.StatusCode, 200) {
  180. t.FailNow()
  181. }
  182. // check the old leader is in standby mode now
  183. time.Sleep(time.Second)
  184. resp, _ = tests.Get(leader + "/name")
  185. assert.Equal(t, resp.StatusCode, 404)
  186. }