server_shutdown_test.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. // Copyright 2017 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package integration
  15. import (
  16. "bytes"
  17. "context"
  18. "reflect"
  19. "strings"
  20. "testing"
  21. "time"
  22. "github.com/coreos/etcd/clientv3"
  23. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  24. "github.com/coreos/etcd/integration"
  25. "github.com/coreos/etcd/pkg/testutil"
  26. "google.golang.org/grpc/codes"
  27. "google.golang.org/grpc/status"
  28. )
  29. // TestBalancerUnderServerShutdownWatch expects that watch client
  30. // switch its endpoints when the member of the pinned endpoint fails.
  31. func TestBalancerUnderServerShutdownWatch(t *testing.T) {
  32. defer testutil.AfterTest(t)
  33. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  34. Size: 3,
  35. SkipCreatingClient: true,
  36. })
  37. defer clus.Terminate(t)
  38. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  39. lead := clus.WaitLeader(t)
  40. // pin eps[lead]
  41. watchCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[lead]}})
  42. if err != nil {
  43. t.Fatal(err)
  44. }
  45. defer watchCli.Close()
  46. // wait for eps[lead] to be pinned
  47. mustWaitPinReady(t, watchCli)
  48. // add all eps to list, so that when the original pined one fails
  49. // the client can switch to other available eps
  50. watchCli.SetEndpoints(eps...)
  51. key, val := "foo", "bar"
  52. wch := watchCli.Watch(context.Background(), key, clientv3.WithCreatedNotify())
  53. select {
  54. case <-wch:
  55. case <-time.After(integration.RequestWaitTimeout):
  56. t.Fatal("took too long to create watch")
  57. }
  58. donec := make(chan struct{})
  59. go func() {
  60. defer close(donec)
  61. // switch to others when eps[lead] is shut down
  62. select {
  63. case ev := <-wch:
  64. if werr := ev.Err(); werr != nil {
  65. t.Fatal(werr)
  66. }
  67. if len(ev.Events) != 1 {
  68. t.Fatalf("expected one event, got %+v", ev)
  69. }
  70. if !bytes.Equal(ev.Events[0].Kv.Value, []byte(val)) {
  71. t.Fatalf("expected %q, got %+v", val, ev.Events[0].Kv)
  72. }
  73. case <-time.After(7 * time.Second):
  74. t.Fatal("took too long to receive events")
  75. }
  76. }()
  77. // shut down eps[lead]
  78. clus.Members[lead].Terminate(t)
  79. // writes to eps[lead+1]
  80. putCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[(lead+1)%3]}})
  81. if err != nil {
  82. t.Fatal(err)
  83. }
  84. defer putCli.Close()
  85. for {
  86. ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
  87. _, err = putCli.Put(ctx, key, val)
  88. cancel()
  89. if err == nil {
  90. break
  91. }
  92. if err == context.DeadlineExceeded || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout || err == rpctypes.ErrTimeoutDueToLeaderFail {
  93. continue
  94. }
  95. t.Fatal(err)
  96. }
  97. select {
  98. case <-donec:
  99. case <-time.After(5 * time.Second): // enough time for balancer switch
  100. t.Fatal("took too long to receive events")
  101. }
  102. }
  103. func TestBalancerUnderServerShutdownPut(t *testing.T) {
  104. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  105. _, err := cli.Put(ctx, "foo", "bar")
  106. return err
  107. })
  108. }
  109. func TestBalancerUnderServerShutdownDelete(t *testing.T) {
  110. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  111. _, err := cli.Delete(ctx, "foo")
  112. return err
  113. })
  114. }
  115. func TestBalancerUnderServerShutdownTxn(t *testing.T) {
  116. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  117. _, err := cli.Txn(ctx).
  118. If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
  119. Then(clientv3.OpPut("foo", "bar")).
  120. Else(clientv3.OpPut("foo", "baz")).Commit()
  121. return err
  122. })
  123. }
  124. // testBalancerUnderServerShutdownMutable expects that when the member of
  125. // the pinned endpoint is shut down, the balancer switches its endpoints
  126. // and all subsequent put/delete/txn requests succeed with new endpoints.
  127. func testBalancerUnderServerShutdownMutable(t *testing.T, op func(*clientv3.Client, context.Context) error) {
  128. defer testutil.AfterTest(t)
  129. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  130. Size: 3,
  131. SkipCreatingClient: true,
  132. })
  133. defer clus.Terminate(t)
  134. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  135. // pin eps[0]
  136. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}})
  137. if err != nil {
  138. t.Fatal(err)
  139. }
  140. defer cli.Close()
  141. // wait for eps[0] to be pinned
  142. mustWaitPinReady(t, cli)
  143. // add all eps to list, so that when the original pined one fails
  144. // the client can switch to other available eps
  145. cli.SetEndpoints(eps...)
  146. // shut down eps[0]
  147. clus.Members[0].Terminate(t)
  148. // switched to others when eps[0] was explicitly shut down
  149. // and following request should succeed
  150. // TODO: remove this (expose client connection state?)
  151. time.Sleep(time.Second)
  152. cctx, ccancel := context.WithTimeout(context.Background(), time.Second)
  153. err = op(cli, cctx)
  154. ccancel()
  155. if err != nil {
  156. t.Fatal(err)
  157. }
  158. }
  159. func TestBalancerUnderServerShutdownGetLinearizable(t *testing.T) {
  160. testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  161. _, err := cli.Get(ctx, "foo")
  162. return err
  163. }, 7*time.Second) // give enough time for leader election, balancer switch
  164. }
  165. func TestBalancerUnderServerShutdownGetSerializable(t *testing.T) {
  166. testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  167. _, err := cli.Get(ctx, "foo", clientv3.WithSerializable())
  168. return err
  169. }, 2*time.Second)
  170. }
  171. // testBalancerUnderServerShutdownImmutable expects that when the member of
  172. // the pinned endpoint is shut down, the balancer switches its endpoints
  173. // and all subsequent range requests succeed with new endpoints.
  174. func testBalancerUnderServerShutdownImmutable(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
  175. defer testutil.AfterTest(t)
  176. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  177. Size: 3,
  178. SkipCreatingClient: true,
  179. })
  180. defer clus.Terminate(t)
  181. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  182. // pin eps[0]
  183. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}})
  184. if err != nil {
  185. t.Errorf("failed to create client: %v", err)
  186. }
  187. defer cli.Close()
  188. // wait for eps[0] to be pinned
  189. mustWaitPinReady(t, cli)
  190. // add all eps to list, so that when the original pined one fails
  191. // the client can switch to other available eps
  192. cli.SetEndpoints(eps...)
  193. // shut down eps[0]
  194. clus.Members[0].Terminate(t)
  195. // switched to others when eps[0] was explicitly shut down
  196. // and following request should succeed
  197. cctx, ccancel := context.WithTimeout(context.Background(), timeout)
  198. err = op(cli, cctx)
  199. ccancel()
  200. if err != nil {
  201. t.Errorf("failed to finish range request in time %v (timeout %v)", err, timeout)
  202. }
  203. }
  204. func TestBalancerUnderServerStopInflightLinearizableGetOnRestart(t *testing.T) {
  205. tt := []pinTestOpt{
  206. {pinLeader: true, stopPinFirst: true},
  207. {pinLeader: true, stopPinFirst: false},
  208. {pinLeader: false, stopPinFirst: true},
  209. {pinLeader: false, stopPinFirst: false},
  210. }
  211. for i := range tt {
  212. testBalancerUnderServerStopInflightRangeOnRestart(t, true, tt[i])
  213. }
  214. }
  215. func TestBalancerUnderServerStopInflightSerializableGetOnRestart(t *testing.T) {
  216. tt := []pinTestOpt{
  217. {pinLeader: true, stopPinFirst: true},
  218. {pinLeader: true, stopPinFirst: false},
  219. {pinLeader: false, stopPinFirst: true},
  220. {pinLeader: false, stopPinFirst: false},
  221. }
  222. for i := range tt {
  223. testBalancerUnderServerStopInflightRangeOnRestart(t, false, tt[i])
  224. }
  225. }
  226. type pinTestOpt struct {
  227. pinLeader bool
  228. stopPinFirst bool
  229. }
  230. // testBalancerUnderServerStopInflightRangeOnRestart expects
  231. // inflight range request reconnects on server restart.
  232. func testBalancerUnderServerStopInflightRangeOnRestart(t *testing.T, linearizable bool, opt pinTestOpt) {
  233. defer testutil.AfterTest(t)
  234. cfg := &integration.ClusterConfig{
  235. Size: 2,
  236. SkipCreatingClient: true,
  237. }
  238. if linearizable {
  239. cfg.Size = 3
  240. }
  241. clus := integration.NewClusterV3(t, cfg)
  242. defer clus.Terminate(t)
  243. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr()}
  244. if linearizable {
  245. eps = append(eps, clus.Members[2].GRPCAddr())
  246. }
  247. lead := clus.WaitLeader(t)
  248. target := lead
  249. if !opt.pinLeader {
  250. target = (target + 1) % 2
  251. }
  252. // pin eps[target]
  253. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[target]}})
  254. if err != nil {
  255. t.Errorf("failed to create client: %v", err)
  256. }
  257. defer cli.Close()
  258. // wait for eps[target] to be pinned
  259. mustWaitPinReady(t, cli)
  260. // add all eps to list, so that when the original pined one fails
  261. // the client can switch to other available eps
  262. cli.SetEndpoints(eps...)
  263. if opt.stopPinFirst {
  264. clus.Members[target].Stop(t)
  265. // give some time for balancer switch before stopping the other
  266. time.Sleep(time.Second)
  267. clus.Members[(target+1)%2].Stop(t)
  268. } else {
  269. clus.Members[(target+1)%2].Stop(t)
  270. // balancer cannot pin other member since it's already stopped
  271. clus.Members[target].Stop(t)
  272. }
  273. // 3-second is the minimum interval between endpoint being marked
  274. // as unhealthy and being removed from unhealthy, so possibly
  275. // takes >5-second to unpin and repin an endpoint
  276. // TODO: decrease timeout when balancer switch rewrite
  277. clientTimeout := 7 * time.Second
  278. var gops []clientv3.OpOption
  279. if !linearizable {
  280. gops = append(gops, clientv3.WithSerializable())
  281. }
  282. donec, readyc := make(chan struct{}), make(chan struct{}, 1)
  283. go func() {
  284. defer close(donec)
  285. ctx, cancel := context.WithTimeout(context.TODO(), clientTimeout)
  286. readyc <- struct{}{}
  287. _, err := cli.Get(ctx, "abc", gops...)
  288. cancel()
  289. if err != nil {
  290. if linearizable && strings.Contains(err.Error(), "context deadline exceeded") {
  291. t.Logf("TODO: FIX THIS after balancer rewrite! %v %v", reflect.TypeOf(err), err)
  292. } else {
  293. t.Fatal(err)
  294. }
  295. }
  296. }()
  297. <-readyc
  298. clus.Members[target].Restart(t)
  299. select {
  300. case <-time.After(clientTimeout + integration.RequestWaitTimeout):
  301. t.Fatalf("timed out waiting for Get [linearizable: %v, opt: %+v]", linearizable, opt)
  302. case <-donec:
  303. }
  304. }
  305. // e.g. due to clock drifts in server-side,
  306. // client context times out first in server-side
  307. // while original client-side context is not timed out yet
  308. func isServerCtxTimeout(err error) bool {
  309. if err == nil {
  310. return false
  311. }
  312. ev, ok := status.FromError(err)
  313. if !ok {
  314. return false
  315. }
  316. code := ev.Code()
  317. return code == codes.DeadlineExceeded && strings.Contains(err.Error(), "context deadline exceeded")
  318. }