multi_node_kill_all_and_recovery_test.go 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. package test
  2. import (
  3. "bytes"
  4. "os"
  5. "strconv"
  6. "strings"
  7. "testing"
  8. "time"
  9. "github.com/coreos/etcd/server"
  10. "github.com/coreos/etcd/tests"
  11. "github.com/coreos/etcd/third_party/github.com/coreos/go-etcd/etcd"
  12. "github.com/coreos/etcd/third_party/github.com/stretchr/testify/assert"
  13. )
  14. // Create a five nodes
  15. // Kill all the nodes and restart
  16. func TestMultiNodeKillAllAndRecovery(t *testing.T) {
  17. procAttr := new(os.ProcAttr)
  18. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  19. stop := make(chan bool)
  20. leaderChan := make(chan string, 1)
  21. all := make(chan bool, 1)
  22. clusterSize := 5
  23. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  24. defer DestroyCluster(etcds)
  25. if err != nil {
  26. t.Fatal("cannot create cluster")
  27. }
  28. c := etcd.NewClient(nil)
  29. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  30. <-all
  31. <-leaderChan
  32. stop <- true
  33. c.SyncCluster()
  34. // send 10 commands
  35. for i := 0; i < 10; i++ {
  36. // Test Set
  37. _, err := c.Set("foo", "bar", 0)
  38. if err != nil {
  39. panic(err)
  40. }
  41. }
  42. time.Sleep(time.Second)
  43. // kill all
  44. DestroyCluster(etcds)
  45. time.Sleep(time.Second)
  46. stop = make(chan bool)
  47. leaderChan = make(chan string, 1)
  48. all = make(chan bool, 1)
  49. time.Sleep(time.Second)
  50. for i := 0; i < clusterSize; i++ {
  51. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  52. }
  53. go Monitor(clusterSize, 1, leaderChan, all, stop)
  54. <-all
  55. <-leaderChan
  56. result, err := c.Set("foo", "bar", 0)
  57. if err != nil {
  58. t.Fatalf("Recovery error: %s", err)
  59. }
  60. if result.Node.ModifiedIndex != 17 {
  61. t.Fatalf("recovery failed! [%d/17]", result.Node.ModifiedIndex)
  62. }
  63. }
  64. // TestTLSMultiNodeKillAllAndRecovery create a five nodes
  65. // then kill all the nodes and restart
  66. func TestTLSMultiNodeKillAllAndRecovery(t *testing.T) {
  67. procAttr := new(os.ProcAttr)
  68. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  69. stop := make(chan bool)
  70. leaderChan := make(chan string, 1)
  71. all := make(chan bool, 1)
  72. clusterSize := 5
  73. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, true)
  74. defer DestroyCluster(etcds)
  75. if err != nil {
  76. t.Fatal("cannot create cluster")
  77. }
  78. time.Sleep(time.Second)
  79. c := etcd.NewClient(nil)
  80. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  81. <-all
  82. <-leaderChan
  83. stop <- true
  84. c.SyncCluster()
  85. // send 10 commands
  86. for i := 0; i < 10; i++ {
  87. // Test Set
  88. _, err := c.Set("foo", "bar", 0)
  89. if err != nil {
  90. panic(err)
  91. }
  92. }
  93. time.Sleep(time.Second)
  94. // kill all
  95. DestroyCluster(etcds)
  96. time.Sleep(time.Second)
  97. stop = make(chan bool)
  98. leaderChan = make(chan string, 1)
  99. all = make(chan bool, 1)
  100. time.Sleep(time.Second)
  101. for i := 0; i < clusterSize; i++ {
  102. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  103. // See util.go for the reason to wait for server
  104. client := buildClient()
  105. err = WaitForServer("127.0.0.1:400"+strconv.Itoa(i+1), client, "http")
  106. if err != nil {
  107. t.Fatalf("node start error: %s", err)
  108. }
  109. }
  110. go Monitor(clusterSize, 1, leaderChan, all, stop)
  111. <-all
  112. <-leaderChan
  113. result, err := c.Set("foo", "bar", 0)
  114. if err != nil {
  115. t.Fatalf("Recovery error: %s", err)
  116. }
  117. if result.Node.ModifiedIndex != 17 {
  118. t.Fatalf("recovery failed! [%d/17]", result.Node.ModifiedIndex)
  119. }
  120. }
  121. // Create a five-node cluster
  122. // Kill all the nodes and restart
  123. func TestMultiNodeKillAllAndRecoveryWithStandbys(t *testing.T) {
  124. procAttr := new(os.ProcAttr)
  125. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  126. stop := make(chan bool)
  127. leaderChan := make(chan string, 1)
  128. all := make(chan bool, 1)
  129. clusterSize := 15
  130. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  131. defer DestroyCluster(etcds)
  132. if err != nil {
  133. t.Fatal("cannot create cluster")
  134. }
  135. c := etcd.NewClient(nil)
  136. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  137. <-all
  138. <-leaderChan
  139. stop <- true
  140. c.SyncCluster()
  141. // Reconfigure with smaller active size (7 nodes) and wait for remove.
  142. resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7}`))
  143. if !assert.Equal(t, resp.StatusCode, 200) {
  144. t.FailNow()
  145. }
  146. time.Sleep(2*server.ActiveMonitorTimeout + (1 * time.Second))
  147. // Verify that there is three machines in peer mode.
  148. result, err := c.Get("_etcd/machines", false, true)
  149. assert.NoError(t, err)
  150. assert.Equal(t, len(result.Node.Nodes), 7)
  151. // send set commands
  152. for i := 0; i < 2*clusterSize; i++ {
  153. // Test Set
  154. _, err := c.Set("foo", "bar", 0)
  155. if err != nil {
  156. panic(err)
  157. }
  158. }
  159. time.Sleep(time.Second)
  160. // kill all
  161. DestroyCluster(etcds)
  162. time.Sleep(time.Second)
  163. stop = make(chan bool)
  164. leaderChan = make(chan string, 1)
  165. all = make(chan bool, 1)
  166. time.Sleep(time.Second)
  167. for i := 0; i < clusterSize; i++ {
  168. etcds[i], err = os.StartProcess(EtcdBinPath, append(argGroup[i], "-peers="), procAttr)
  169. }
  170. time.Sleep(2 * time.Second)
  171. // send set commands
  172. for i := 0; i < 2*clusterSize; i++ {
  173. // Test Set
  174. _, err := c.Set("foo", "bar", 0)
  175. if err != nil {
  176. t.Fatalf("Recovery error: %s", err)
  177. }
  178. }
  179. // Verify that we have seven machines.
  180. result, err = c.Get("_etcd/machines", false, true)
  181. assert.NoError(t, err)
  182. assert.Equal(t, len(result.Node.Nodes), 7)
  183. }
  184. // Create a five nodes
  185. // Kill all the nodes and restart, then remove the leader
  186. func TestMultiNodeKillAllAndRecoveryAndRemoveLeader(t *testing.T) {
  187. procAttr := new(os.ProcAttr)
  188. procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr}
  189. stop := make(chan bool)
  190. leaderChan := make(chan string, 1)
  191. all := make(chan bool, 1)
  192. clusterSize := 5
  193. argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false)
  194. defer DestroyCluster(etcds)
  195. if err != nil {
  196. t.Fatal("cannot create cluster")
  197. }
  198. c := etcd.NewClient(nil)
  199. go Monitor(clusterSize, clusterSize, leaderChan, all, stop)
  200. <-all
  201. <-leaderChan
  202. stop <- true
  203. // It needs some time to sync current commits and write it to disk.
  204. // Or some instance may be restarted as a new peer, and we don't support
  205. // to connect back the old cluster that doesn't have majority alive
  206. // without log now.
  207. time.Sleep(time.Second)
  208. c.SyncCluster()
  209. // kill all
  210. DestroyCluster(etcds)
  211. time.Sleep(time.Second)
  212. stop = make(chan bool)
  213. leaderChan = make(chan string, 1)
  214. all = make(chan bool, 1)
  215. time.Sleep(time.Second)
  216. for i := 0; i < clusterSize; i++ {
  217. etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr)
  218. }
  219. go Monitor(clusterSize, 1, leaderChan, all, stop)
  220. <-all
  221. leader := <-leaderChan
  222. _, err = c.Set("foo", "bar", 0)
  223. if err != nil {
  224. t.Fatalf("Recovery error: %s", err)
  225. }
  226. port, _ := strconv.Atoi(strings.Split(leader, ":")[2])
  227. num := port - 7000
  228. resp, _ := tests.Delete(leader+"/v2/admin/machines/node"+strconv.Itoa(num), "application/json", nil)
  229. if !assert.Equal(t, resp.StatusCode, 200) {
  230. t.FailNow()
  231. }
  232. // check the old leader is in standby mode now
  233. time.Sleep(time.Second)
  234. resp, _ = tests.Get(leader + "/name")
  235. assert.Equal(t, resp.StatusCode, 404)
  236. }