network_partition_test.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. // Copyright 2017 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // +build !cluster_proxy
  15. package integration
  16. import (
  17. "context"
  18. "errors"
  19. "strings"
  20. "testing"
  21. "time"
  22. "github.com/coreos/etcd/clientv3"
  23. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  24. "github.com/coreos/etcd/integration"
  25. "github.com/coreos/etcd/pkg/testutil"
  26. "google.golang.org/grpc/codes"
  27. "google.golang.org/grpc/status"
  28. )
  29. var errExpected = errors.New("expected error")
  30. // TestBalancerUnderNetworkPartitionPut tests when one member becomes isolated,
  31. // first Put request fails, and following retry succeeds with client balancer
  32. // switching to others.
  33. func TestBalancerUnderNetworkPartitionPut(t *testing.T) {
  34. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  35. _, err := cli.Put(ctx, "a", "b")
  36. if err == context.DeadlineExceeded || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
  37. return errExpected
  38. }
  39. return err
  40. }, time.Second)
  41. }
  42. func TestBalancerUnderNetworkPartitionDelete(t *testing.T) {
  43. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  44. _, err := cli.Delete(ctx, "a")
  45. if err == context.DeadlineExceeded || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
  46. return errExpected
  47. }
  48. return err
  49. }, time.Second)
  50. }
  51. func TestBalancerUnderNetworkPartitionTxn(t *testing.T) {
  52. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  53. _, err := cli.Txn(ctx).
  54. If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
  55. Then(clientv3.OpPut("foo", "bar")).
  56. Else(clientv3.OpPut("foo", "baz")).Commit()
  57. if err == context.DeadlineExceeded || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
  58. return errExpected
  59. }
  60. return err
  61. }, time.Second)
  62. }
  63. // TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout tests
  64. // when one member becomes isolated, first quorum Get request succeeds
  65. // by switching endpoints within the timeout (long enough to cover endpoint switch).
  66. func TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout(t *testing.T) {
  67. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  68. _, err := cli.Get(ctx, "a")
  69. return err
  70. }, 7*time.Second)
  71. }
  72. // TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout tests
  73. // when one member becomes isolated, first quorum Get request fails,
  74. // and following retry succeeds with client balancer switching to others.
  75. func TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout(t *testing.T) {
  76. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  77. _, err := cli.Get(ctx, "a")
  78. if err == context.DeadlineExceeded || isServerCtxTimeout(err) {
  79. return errExpected
  80. }
  81. return err
  82. }, time.Second)
  83. }
  84. // e.g. due to clock drifts in server-side,
  85. // client context times out first in server-side
  86. // while original client-side context is not timed out yet
  87. func isServerCtxTimeout(err error) bool {
  88. if err == nil {
  89. return false
  90. }
  91. ev, _ := status.FromError(err)
  92. code := ev.Code()
  93. return code == codes.DeadlineExceeded && strings.Contains(err.Error(), "context deadline exceeded")
  94. }
  95. func TestBalancerUnderNetworkPartitionSerializableGet(t *testing.T) {
  96. testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
  97. _, err := cli.Get(ctx, "a", clientv3.WithSerializable())
  98. return err
  99. }, time.Second)
  100. }
  101. func testBalancerUnderNetworkPartition(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
  102. defer testutil.AfterTest(t)
  103. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  104. Size: 3,
  105. SkipCreatingClient: true,
  106. })
  107. defer clus.Terminate(t)
  108. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  109. // expect pin eps[0]
  110. ccfg := clientv3.Config{
  111. Endpoints: []string{eps[0]},
  112. DialTimeout: 3 * time.Second,
  113. }
  114. cli, err := clientv3.New(ccfg)
  115. if err != nil {
  116. t.Fatal(err)
  117. }
  118. defer cli.Close()
  119. // wait for eps[0] to be pinned
  120. mustWaitPinReady(t, cli)
  121. // add other endpoints for later endpoint switch
  122. cli.SetEndpoints(eps...)
  123. clus.Members[0].InjectPartition(t, clus.Members[1:]...)
  124. for i := 0; i < 2; i++ {
  125. ctx, cancel := context.WithTimeout(context.Background(), timeout)
  126. err = op(cli, ctx)
  127. cancel()
  128. if err == nil {
  129. break
  130. }
  131. if err != errExpected {
  132. t.Errorf("#%d: expected %v, got %v", i, errExpected, err)
  133. }
  134. // give enough time for endpoint switch
  135. // TODO: remove random sleep by syncing directly with balancer
  136. if i == 0 {
  137. time.Sleep(5 * time.Second)
  138. }
  139. }
  140. if err != nil {
  141. t.Errorf("balancer did not switch in time (%v)", err)
  142. }
  143. }
  144. // TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection ensures balancer
  145. // switches endpoint when leader fails and linearizable get requests returns
  146. // "etcdserver: request timed out".
  147. func TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection(t *testing.T) {
  148. defer testutil.AfterTest(t)
  149. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  150. Size: 3,
  151. SkipCreatingClient: true,
  152. })
  153. defer clus.Terminate(t)
  154. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  155. lead := clus.WaitLeader(t)
  156. timeout := 3 * clus.Members[(lead+1)%2].ServerConfig.ReqTimeout()
  157. cli, err := clientv3.New(clientv3.Config{
  158. Endpoints: []string{eps[(lead+1)%2]},
  159. DialTimeout: 1 * time.Second,
  160. })
  161. if err != nil {
  162. t.Fatal(err)
  163. }
  164. defer cli.Close()
  165. // wait for non-leader to be pinned
  166. mustWaitPinReady(t, cli)
  167. // add all eps to list, so that when the original pined one fails
  168. // the client can switch to other available eps
  169. cli.SetEndpoints(eps[lead], eps[(lead+1)%2])
  170. // isolate leader
  171. clus.Members[lead].InjectPartition(t, clus.Members[(lead+1)%3], clus.Members[(lead+2)%3])
  172. // expects balancer endpoint switch while ongoing leader election
  173. ctx, cancel := context.WithTimeout(context.TODO(), timeout)
  174. _, err = cli.Get(ctx, "a")
  175. cancel()
  176. if err != nil {
  177. t.Fatal(err)
  178. }
  179. }
  180. func TestBalancerUnderNetworkPartitionWatchLeader(t *testing.T) {
  181. testBalancerUnderNetworkPartitionWatch(t, true)
  182. }
  183. func TestBalancerUnderNetworkPartitionWatchFollower(t *testing.T) {
  184. testBalancerUnderNetworkPartitionWatch(t, false)
  185. }
  186. // testBalancerUnderNetworkPartitionWatch ensures watch stream
  187. // to a partitioned node be closed when context requires leader.
  188. func testBalancerUnderNetworkPartitionWatch(t *testing.T, isolateLeader bool) {
  189. defer testutil.AfterTest(t)
  190. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  191. Size: 3,
  192. SkipCreatingClient: true,
  193. })
  194. defer clus.Terminate(t)
  195. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  196. target := clus.WaitLeader(t)
  197. if !isolateLeader {
  198. target = (target + 1) % 3
  199. }
  200. // pin eps[target]
  201. watchCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[target]}})
  202. if err != nil {
  203. t.Fatal(err)
  204. }
  205. defer watchCli.Close()
  206. // wait for eps[target] to be pinned
  207. mustWaitPinReady(t, watchCli)
  208. // add all eps to list, so that when the original pined one fails
  209. // the client can switch to other available eps
  210. watchCli.SetEndpoints(eps...)
  211. wch := watchCli.Watch(clientv3.WithRequireLeader(context.Background()), "foo", clientv3.WithCreatedNotify())
  212. select {
  213. case <-wch:
  214. case <-time.After(3 * time.Second):
  215. t.Fatal("took too long to create watch")
  216. }
  217. // isolate eps[target]
  218. clus.Members[target].InjectPartition(t,
  219. clus.Members[(target+1)%3],
  220. clus.Members[(target+2)%3],
  221. )
  222. select {
  223. case ev := <-wch:
  224. if len(ev.Events) != 0 {
  225. t.Fatal("expected no event")
  226. }
  227. if err = ev.Err(); err != rpctypes.ErrNoLeader {
  228. t.Fatalf("expected %v, got %v", rpctypes.ErrNoLeader, err)
  229. }
  230. case <-time.After(3 * time.Second): // enough time to detect leader lost
  231. t.Fatal("took too long to detect leader lost")
  232. }
  233. }