network_partition_test.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. // Copyright 2017 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // +build !cluster_proxy
  15. package integration
  16. import (
  17. "errors"
  18. "testing"
  19. "time"
  20. "github.com/coreos/etcd/clientv3"
  21. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  22. "github.com/coreos/etcd/integration"
  23. "github.com/coreos/etcd/pkg/testutil"
  24. "golang.org/x/net/context"
  25. )
  26. var errExpected = errors.New("expected error")
  27. // TestBalancerUnderNetworkPartitionPut tests when one member becomes isolated,
  28. // first Put request fails, and following retry succeeds with client balancer
  29. // switching to others.
  30. func TestBalancerUnderNetworkPartitionPut(t *testing.T) {
  31. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  32. _, err := cli.Put(ctx, "a", "b")
  33. if err == context.DeadlineExceeded || err == rpctypes.ErrTimeout {
  34. return errExpected
  35. }
  36. return err
  37. }, time.Second)
  38. }
  39. func TestBalancerUnderNetworkPartitionDelete(t *testing.T) {
  40. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  41. _, err := cli.Delete(ctx, "a")
  42. if err == context.DeadlineExceeded || err == rpctypes.ErrTimeout {
  43. return errExpected
  44. }
  45. return err
  46. }, time.Second)
  47. }
  48. func TestBalancerUnderNetworkPartitionTxn(t *testing.T) {
  49. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  50. _, err := cli.Txn(ctx).
  51. If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
  52. Then(clientv3.OpPut("foo", "bar")).
  53. Else(clientv3.OpPut("foo", "baz")).Commit()
  54. if err == context.DeadlineExceeded || err == rpctypes.ErrTimeout {
  55. return errExpected
  56. }
  57. return err
  58. }, time.Second)
  59. }
  60. // TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout tests
  61. // when one member becomes isolated, first quorum Get request succeeds
  62. // by switching endpoints within the timeout (long enough to cover endpoint switch).
  63. func TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout(t *testing.T) {
  64. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  65. _, err := cli.Get(ctx, "a")
  66. return err
  67. }, 7*time.Second)
  68. }
  69. // TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout tests
  70. // when one member becomes isolated, first quorum Get request fails,
  71. // and following retry succeeds with client balancer switching to others.
  72. func TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout(t *testing.T) {
  73. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  74. _, err := cli.Get(ctx, "a")
  75. if err == context.DeadlineExceeded {
  76. return errExpected
  77. }
  78. return err
  79. }, time.Second)
  80. }
  81. func TestBalancerUnderNetworkPartitionSerializableGet(t *testing.T) {
  82. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  83. _, err := cli.Get(ctx, "a", clientv3.WithSerializable())
  84. return err
  85. }, time.Second)
  86. }
  87. func testBalancerUnderNetworkPartition(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
  88. defer testutil.AfterTest(t)
  89. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  90. Size: 3,
  91. GRPCKeepAliveMinTime: time.Millisecond, // avoid too_many_pings
  92. SkipCreatingClient: true,
  93. })
  94. defer clus.Terminate(t)
  95. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  96. // expect pin eps[0]
  97. ccfg := clientv3.Config{
  98. Endpoints: []string{eps[0]},
  99. DialTimeout: 3 * time.Second,
  100. }
  101. cli, err := clientv3.New(ccfg)
  102. if err != nil {
  103. t.Fatal(err)
  104. }
  105. defer cli.Close()
  106. // wait for eps[0] to be pinned
  107. mustWaitPinReady(t, cli)
  108. // add other endpoints for later endpoint switch
  109. cli.SetEndpoints(eps...)
  110. clus.Members[0].InjectPartition(t, clus.Members[1:]...)
  111. for i := 0; i < 2; i++ {
  112. ctx, cancel := context.WithTimeout(context.Background(), timeout)
  113. err = op(cli, ctx)
  114. cancel()
  115. if err == nil {
  116. break
  117. }
  118. if err != errExpected {
  119. t.Errorf("#%d: expected %v, got %v", i, errExpected, err)
  120. }
  121. // give enough time for endpoint switch
  122. // TODO: remove random sleep by syncing directly with balancer
  123. if i == 0 {
  124. time.Sleep(5 * time.Second)
  125. }
  126. }
  127. if err != nil {
  128. t.Errorf("balancer did not switch in time (%v)", err)
  129. }
  130. }
  131. // TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection ensures balancer
  132. // switches endpoint when leader fails and linearizable get requests returns
  133. // "etcdserver: request timed out".
  134. func TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection(t *testing.T) {
  135. defer testutil.AfterTest(t)
  136. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  137. Size: 3,
  138. SkipCreatingClient: true,
  139. })
  140. defer clus.Terminate(t)
  141. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  142. lead := clus.WaitLeader(t)
  143. timeout := 3 * clus.Members[(lead+1)%2].ServerConfig.ReqTimeout()
  144. cli, err := clientv3.New(clientv3.Config{
  145. Endpoints: []string{eps[(lead+1)%2]},
  146. DialTimeout: 1 * time.Second,
  147. })
  148. if err != nil {
  149. t.Fatal(err)
  150. }
  151. defer cli.Close()
  152. // wait for non-leader to be pinned
  153. mustWaitPinReady(t, cli)
  154. // add all eps to list, so that when the original pined one fails
  155. // the client can switch to other available eps
  156. cli.SetEndpoints(eps[lead], eps[(lead+1)%2])
  157. // isolate leader
  158. clus.Members[lead].InjectPartition(t, clus.Members[(lead+1)%3], clus.Members[(lead+2)%3])
  159. // expects balancer endpoint switch while ongoing leader election
  160. ctx, cancel := context.WithTimeout(context.TODO(), timeout)
  161. _, err = cli.Get(ctx, "a")
  162. cancel()
  163. if err != nil {
  164. t.Fatal(err)
  165. }
  166. }
  167. func TestBalancerUnderNetworkPartitionWatchLeader(t *testing.T) {
  168. testBalancerUnderNetworkPartitionWatch(t, true)
  169. }
  170. func TestBalancerUnderNetworkPartitionWatchFollower(t *testing.T) {
  171. testBalancerUnderNetworkPartitionWatch(t, false)
  172. }
  173. // testBalancerUnderNetworkPartitionWatch ensures watch stream
  174. // to a partitioned node be closed when context requires leader.
  175. func testBalancerUnderNetworkPartitionWatch(t *testing.T, isolateLeader bool) {
  176. defer testutil.AfterTest(t)
  177. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  178. Size: 3,
  179. SkipCreatingClient: true,
  180. })
  181. defer clus.Terminate(t)
  182. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  183. target := clus.WaitLeader(t)
  184. if !isolateLeader {
  185. target = (target + 1) % 3
  186. }
  187. // pin eps[target]
  188. watchCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[target]}})
  189. if err != nil {
  190. t.Fatal(err)
  191. }
  192. defer watchCli.Close()
  193. // wait for eps[target] to be pinned
  194. mustWaitPinReady(t, watchCli)
  195. // add all eps to list, so that when the original pined one fails
  196. // the client can switch to other available eps
  197. watchCli.SetEndpoints(eps...)
  198. wch := watchCli.Watch(clientv3.WithRequireLeader(context.Background()), "foo", clientv3.WithCreatedNotify())
  199. select {
  200. case <-wch:
  201. case <-time.After(3 * time.Second):
  202. t.Fatal("took too long to create watch")
  203. }
  204. // isolate eps[target]
  205. clus.Members[target].InjectPartition(t,
  206. clus.Members[(target+1)%3],
  207. clus.Members[(target+2)%3],
  208. )
  209. select {
  210. case ev := <-wch:
  211. if len(ev.Events) != 0 {
  212. t.Fatal("expected no event")
  213. }
  214. if err = ev.Err(); err != rpctypes.ErrNoLeader {
  215. t.Fatalf("expected %v, got %v", rpctypes.ErrNoLeader, err)
  216. }
  217. case <-time.After(3 * time.Second): // enough time to detect leader lost
  218. t.Fatal("took too long to detect leader lost")
  219. }
  220. }