multi_node_kill_all_and_recovery_test.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. // +build ignore
  2. package test
  3. import (
  4. "bytes"
  5. "os"
  6. "strconv"
  7. "strings"
  8. "testing"
  9. "time"
  10. "github.com/coreos/etcd/server"
  11. "github.com/coreos/etcd/tests"
  12. "github.com/coreos/etcd/third_party/github.com/coreos/go-etcd/etcd"
  13. "github.com/coreos/etcd/third_party/github.com/stretchr/testify/assert"
  14. )
  15. // TestTLSMultiNodeKillAllAndRecovery create a five nodes
  16. // then kill all the nodes and restart
  17. func TestTLSMultiNodeKillAllAndRecovery(t *testing.T) {
  18. procAttr := new(os.ProcAttr)
  19. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  20. stop := make(chan bool)
  21. leaderChan := make(chan string, 1)
  22. all := make(chan bool, 1)
  23. clusterSize := 5
  24. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, true)
  25. defer DestroyCluster(etcds)
  26. if err != nil {
  27. t.Fatal("cannot create cluster")
  28. }
  29. time.Sleep(time.Second)
  30. c := etcd.NewClient(nil)
  31. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  32. <-all
  33. <-leaderChan
  34. stop <- true
  35. c.SyncCluster()
  36. // send 10 commands
  37. for i := 0; i < 10; i++ {
  38. // Test Set
  39. _, err := c.Set("foo", "bar", 0)
  40. if err != nil {
  41. panic(err)
  42. }
  43. }
  44. time.Sleep(time.Second)
  45. // kill all
  46. DestroyCluster(etcds)
  47. time.Sleep(time.Second)
  48. stop = make(chan bool)
  49. leaderChan = make(chan string, 1)
  50. all = make(chan bool, 1)
  51. time.Sleep(time.Second)
  52. for i := 0; i < clusterSize; i++ {
  53. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  54. // See util.go for the reason to wait for server
  55. client := buildClient()
  56. err = WaitForServer("127.0.0.1:400"+strconv.Itoa(i+1), client, "http")
  57. if err != nil {
  58. t.Fatalf("node start error: %s", err)
  59. }
  60. }
  61. go Monitor(clusterSize, 1, leaderChan, all, stop)
  62. <-all
  63. <-leaderChan
  64. result, err := c.Set("foo", "bar", 0)
  65. if err != nil {
  66. t.Fatalf("Recovery error: %s", err)
  67. }
  68. if result.Node.ModifiedIndex != 17 {
  69. t.Fatalf("recovery failed! [%d/17]", result.Node.ModifiedIndex)
  70. }
  71. }
  72. // Create a five-node cluster
  73. // Kill all the nodes and restart
  74. func TestMultiNodeKillAllAndRecoveryWithStandbys(t *testing.T) {
  75. procAttr := new(os.ProcAttr)
  76. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  77. stop := make(chan bool)
  78. leaderChan := make(chan string, 1)
  79. all := make(chan bool, 1)
  80. clusterSize := 15
  81. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  82. defer DestroyCluster(etcds)
  83. if err != nil {
  84. t.Fatal("cannot create cluster")
  85. }
  86. c := etcd.NewClient(nil)
  87. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  88. <-all
  89. <-leaderChan
  90. stop <- true
  91. c.SyncCluster()
  92. // Reconfigure with smaller active size (7 nodes) and wait for remove.
  93. resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7}`))
  94. if !assert.Equal(t, resp.StatusCode, 200) {
  95. t.FailNow()
  96. }
  97. time.Sleep(2*server.ActiveMonitorTimeout + (1 * time.Second))
  98. // Verify that there is three machines in peer mode.
  99. result, err := c.Get("_etcd/machines", false, true)
  100. assert.NoError(t, err)
  101. assert.Equal(t, len(result.Node.Nodes), 7)
  102. // send set commands
  103. for i := 0; i < 2*clusterSize; i++ {
  104. // Test Set
  105. _, err := c.Set("foo", "bar", 0)
  106. if err != nil {
  107. panic(err)
  108. }
  109. }
  110. time.Sleep(time.Second)
  111. // kill all
  112. DestroyCluster(etcds)
  113. time.Sleep(time.Second)
  114. stop = make(chan bool)
  115. leaderChan = make(chan string, 1)
  116. all = make(chan bool, 1)
  117. time.Sleep(time.Second)
  118. for i := 0; i < clusterSize; i++ {
  119. etcds[i], err = os.StartProcess(EtcdBinPath, append(argGroup[i], "-peers="), procAttr)
  120. }
  121. time.Sleep(2 * time.Second)
  122. // send set commands
  123. for i := 0; i < 2*clusterSize; i++ {
  124. // Test Set
  125. _, err := c.Set("foo", "bar", 0)
  126. if err != nil {
  127. t.Fatalf("Recovery error: %s", err)
  128. }
  129. }
  130. // Verify that we have seven machines.
  131. result, err = c.Get("_etcd/machines", false, true)
  132. assert.NoError(t, err)
  133. assert.Equal(t, len(result.Node.Nodes), 7)
  134. }
  135. // Create a five nodes
  136. // Kill all the nodes and restart, then remove the leader
  137. func TestMultiNodeKillAllAndRecoveryAndRemoveLeader(t *testing.T) {
  138. procAttr := new(os.ProcAttr)
  139. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  140. stop := make(chan bool)
  141. leaderChan := make(chan string, 1)
  142. all := make(chan bool, 1)
  143. clusterSize := 5
  144. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  145. defer DestroyCluster(etcds)
  146. if err != nil {
  147. t.Fatal("cannot create cluster")
  148. }
  149. c := etcd.NewClient(nil)
  150. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  151. <-all
  152. <-leaderChan
  153. stop <- true
  154. // It needs some time to sync current commits and write it to disk.
  155. // Or some instance may be restarted as a new peer, and we don't support
  156. // to connect back the old cluster that doesn't have majority alive
  157. // without log now.
  158. time.Sleep(time.Second)
  159. c.SyncCluster()
  160. // kill all
  161. DestroyCluster(etcds)
  162. time.Sleep(time.Second)
  163. stop = make(chan bool)
  164. leaderChan = make(chan string, 1)
  165. all = make(chan bool, 1)
  166. time.Sleep(time.Second)
  167. for i := 0; i < clusterSize; i++ {
  168. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  169. }
  170. go Monitor(clusterSize, 1, leaderChan, all, stop)
  171. <-all
  172. leader := <-leaderChan
  173. _, err = c.Set("foo", "bar", 0)
  174. if err != nil {
  175. t.Fatalf("Recovery error: %s", err)
  176. }
  177. port, _ := strconv.Atoi(strings.Split(leader, ":")[2])
  178. num := port - 7000
  179. resp, _ := tests.Delete(leader+"/v2/admin/machines/node"+strconv.Itoa(num), "application/json", nil)
  180. if !assert.Equal(t, resp.StatusCode, 200) {
  181. t.FailNow()
  182. }
  183. // check the old leader is in standby mode now
  184. time.Sleep(time.Second)
  185. resp, _ = tests.Get(leader + "/name")
  186. assert.Equal(t, resp.StatusCode, 404)
  187. }