server_shutdown_test.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. // Copyright 2017 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package integration
  15. import (
  16. "bytes"
  17. "context"
  18. "strings"
  19. "testing"
  20. "time"
  21. "github.com/coreos/etcd/clientv3"
  22. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  23. "github.com/coreos/etcd/integration"
  24. "github.com/coreos/etcd/pkg/testutil"
  25. "google.golang.org/grpc"
  26. "google.golang.org/grpc/codes"
  27. "google.golang.org/grpc/status"
  28. )
  29. // TestBalancerUnderServerShutdownWatch expects that watch client
  30. // switch its endpoints when the member of the pinned endpoint fails.
  31. func TestBalancerUnderServerShutdownWatch(t *testing.T) {
  32. defer testutil.AfterTest(t)
  33. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  34. Size: 3,
  35. SkipCreatingClient: true,
  36. })
  37. defer clus.Terminate(t)
  38. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  39. lead := clus.WaitLeader(t)
  40. // pin eps[lead]
  41. watchCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[lead]}})
  42. if err != nil {
  43. t.Fatal(err)
  44. }
  45. defer watchCli.Close()
  46. // wait for eps[lead] to be pinned
  47. mustWaitPinReady(t, watchCli)
  48. // add all eps to list, so that when the original pined one fails
  49. // the client can switch to other available eps
  50. watchCli.SetEndpoints(eps...)
  51. key, val := "foo", "bar"
  52. wch := watchCli.Watch(context.Background(), key, clientv3.WithCreatedNotify())
  53. select {
  54. case <-wch:
  55. case <-time.After(integration.RequestWaitTimeout):
  56. t.Fatal("took too long to create watch")
  57. }
  58. donec := make(chan struct{})
  59. go func() {
  60. defer close(donec)
  61. // switch to others when eps[lead] is shut down
  62. select {
  63. case ev := <-wch:
  64. if werr := ev.Err(); werr != nil {
  65. t.Error(werr)
  66. }
  67. if len(ev.Events) != 1 {
  68. t.Errorf("expected one event, got %+v", ev)
  69. }
  70. if !bytes.Equal(ev.Events[0].Kv.Value, []byte(val)) {
  71. t.Errorf("expected %q, got %+v", val, ev.Events[0].Kv)
  72. }
  73. case <-time.After(7 * time.Second):
  74. t.Error("took too long to receive events")
  75. }
  76. }()
  77. // shut down eps[lead]
  78. clus.Members[lead].Terminate(t)
  79. // writes to eps[lead+1]
  80. putCli, err := clientv3.New(clientv3.Config{
  81. Endpoints: []string{eps[(lead+1)%3]},
  82. DialOptions: []grpc.DialOption{grpc.WithBlock()},
  83. })
  84. if err != nil {
  85. t.Fatal(err)
  86. }
  87. defer putCli.Close()
  88. for {
  89. ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
  90. _, err = putCli.Put(ctx, key, val)
  91. cancel()
  92. if err == nil {
  93. break
  94. }
  95. if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout || err == rpctypes.ErrTimeoutDueToLeaderFail {
  96. continue
  97. }
  98. t.Fatal(err)
  99. }
  100. select {
  101. case <-donec:
  102. case <-time.After(5 * time.Second): // enough time for balancer switch
  103. t.Fatal("took too long to receive events")
  104. }
  105. }
  106. func TestBalancerUnderServerShutdownPut(t *testing.T) {
  107. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  108. _, err := cli.Put(ctx, "foo", "bar")
  109. return err
  110. })
  111. }
  112. func TestBalancerUnderServerShutdownDelete(t *testing.T) {
  113. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  114. _, err := cli.Delete(ctx, "foo")
  115. return err
  116. })
  117. }
  118. func TestBalancerUnderServerShutdownTxn(t *testing.T) {
  119. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  120. _, err := cli.Txn(ctx).
  121. If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
  122. Then(clientv3.OpPut("foo", "bar")).
  123. Else(clientv3.OpPut("foo", "baz")).Commit()
  124. return err
  125. })
  126. }
  127. // testBalancerUnderServerShutdownMutable expects that when the member of
  128. // the pinned endpoint is shut down, the balancer switches its endpoints
  129. // and all subsequent put/delete/txn requests succeed with new endpoints.
  130. func testBalancerUnderServerShutdownMutable(t *testing.T, op func(*clientv3.Client, context.Context) error) {
  131. defer testutil.AfterTest(t)
  132. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  133. Size: 3,
  134. SkipCreatingClient: true,
  135. })
  136. defer clus.Terminate(t)
  137. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  138. // pin eps[0]
  139. cli, err := clientv3.New(clientv3.Config{
  140. Endpoints: []string{eps[0]},
  141. DialOptions: []grpc.DialOption{grpc.WithBlock()},
  142. })
  143. if err != nil {
  144. t.Fatal(err)
  145. }
  146. defer cli.Close()
  147. // wait for eps[0] to be pinned
  148. mustWaitPinReady(t, cli)
  149. // add all eps to list, so that when the original pined one fails
  150. // the client can switch to other available eps
  151. cli.SetEndpoints(eps...)
  152. // shut down eps[0]
  153. clus.Members[0].Terminate(t)
  154. // switched to others when eps[0] was explicitly shut down
  155. // and following request should succeed
  156. // TODO: remove this (expose client connection state?)
  157. time.Sleep(time.Second)
  158. cctx, ccancel := context.WithTimeout(context.Background(), time.Second)
  159. err = op(cli, cctx)
  160. ccancel()
  161. if err != nil {
  162. t.Fatal(err)
  163. }
  164. }
  165. func TestBalancerUnderServerShutdownGetLinearizable(t *testing.T) {
  166. testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  167. _, err := cli.Get(ctx, "foo")
  168. return err
  169. }, 7*time.Second) // give enough time for leader election, balancer switch
  170. }
  171. func TestBalancerUnderServerShutdownGetSerializable(t *testing.T) {
  172. testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  173. _, err := cli.Get(ctx, "foo", clientv3.WithSerializable())
  174. return err
  175. }, 2*time.Second)
  176. }
  177. // testBalancerUnderServerShutdownImmutable expects that when the member of
  178. // the pinned endpoint is shut down, the balancer switches its endpoints
  179. // and all subsequent range requests succeed with new endpoints.
  180. func testBalancerUnderServerShutdownImmutable(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
  181. defer testutil.AfterTest(t)
  182. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  183. Size: 3,
  184. SkipCreatingClient: true,
  185. })
  186. defer clus.Terminate(t)
  187. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  188. // pin eps[0]
  189. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}})
  190. if err != nil {
  191. t.Errorf("failed to create client: %v", err)
  192. }
  193. defer cli.Close()
  194. // wait for eps[0] to be pinned
  195. mustWaitPinReady(t, cli)
  196. // add all eps to list, so that when the original pined one fails
  197. // the client can switch to other available eps
  198. cli.SetEndpoints(eps...)
  199. // shut down eps[0]
  200. clus.Members[0].Terminate(t)
  201. // switched to others when eps[0] was explicitly shut down
  202. // and following request should succeed
  203. cctx, ccancel := context.WithTimeout(context.Background(), timeout)
  204. err = op(cli, cctx)
  205. ccancel()
  206. if err != nil {
  207. t.Errorf("failed to finish range request in time %v (timeout %v)", err, timeout)
  208. }
  209. }
  210. func TestBalancerUnderServerStopInflightLinearizableGetOnRestart(t *testing.T) {
  211. tt := []pinTestOpt{
  212. {pinLeader: true, stopPinFirst: true},
  213. {pinLeader: true, stopPinFirst: false},
  214. {pinLeader: false, stopPinFirst: true},
  215. {pinLeader: false, stopPinFirst: false},
  216. }
  217. for i := range tt {
  218. testBalancerUnderServerStopInflightRangeOnRestart(t, true, tt[i])
  219. }
  220. }
  221. func TestBalancerUnderServerStopInflightSerializableGetOnRestart(t *testing.T) {
  222. tt := []pinTestOpt{
  223. {pinLeader: true, stopPinFirst: true},
  224. {pinLeader: true, stopPinFirst: false},
  225. {pinLeader: false, stopPinFirst: true},
  226. {pinLeader: false, stopPinFirst: false},
  227. }
  228. for i := range tt {
  229. testBalancerUnderServerStopInflightRangeOnRestart(t, false, tt[i])
  230. }
  231. }
  232. type pinTestOpt struct {
  233. pinLeader bool
  234. stopPinFirst bool
  235. }
  236. // testBalancerUnderServerStopInflightRangeOnRestart expects
  237. // inflight range request reconnects on server restart.
  238. func testBalancerUnderServerStopInflightRangeOnRestart(t *testing.T, linearizable bool, opt pinTestOpt) {
  239. defer testutil.AfterTest(t)
  240. cfg := &integration.ClusterConfig{
  241. Size: 2,
  242. SkipCreatingClient: true,
  243. }
  244. if linearizable {
  245. cfg.Size = 3
  246. }
  247. clus := integration.NewClusterV3(t, cfg)
  248. defer clus.Terminate(t)
  249. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr()}
  250. if linearizable {
  251. eps = append(eps, clus.Members[2].GRPCAddr())
  252. }
  253. lead := clus.WaitLeader(t)
  254. target := lead
  255. if !opt.pinLeader {
  256. target = (target + 1) % 2
  257. }
  258. // pin eps[target]
  259. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[target]}})
  260. if err != nil {
  261. t.Errorf("failed to create client: %v", err)
  262. }
  263. defer cli.Close()
  264. // wait for eps[target] to be pinned
  265. mustWaitPinReady(t, cli)
  266. // add all eps to list, so that when the original pined one fails
  267. // the client can switch to other available eps
  268. cli.SetEndpoints(eps...)
  269. if opt.stopPinFirst {
  270. clus.Members[target].Stop(t)
  271. // give some time for balancer switch before stopping the other
  272. time.Sleep(time.Second)
  273. clus.Members[(target+1)%2].Stop(t)
  274. } else {
  275. clus.Members[(target+1)%2].Stop(t)
  276. // balancer cannot pin other member since it's already stopped
  277. clus.Members[target].Stop(t)
  278. }
  279. // 3-second is the minimum interval between endpoint being marked
  280. // as unhealthy and being removed from unhealthy, so possibly
  281. // takes >5-second to unpin and repin an endpoint
  282. // TODO: decrease timeout when balancer switch rewrite
  283. clientTimeout := 7 * time.Second
  284. var gops []clientv3.OpOption
  285. if !linearizable {
  286. gops = append(gops, clientv3.WithSerializable())
  287. }
  288. donec, readyc := make(chan struct{}), make(chan struct{}, 1)
  289. go func() {
  290. defer close(donec)
  291. ctx, cancel := context.WithTimeout(context.TODO(), clientTimeout)
  292. readyc <- struct{}{}
  293. // TODO: The new grpc load balancer will not pin to an endpoint
  294. // as intended by this test. But it will round robin member within
  295. // two attempts.
  296. // Remove retry loop once the new grpc load balancer provides retry.
  297. for i := 0; i < 2; i++ {
  298. _, err = cli.Get(ctx, "abc", gops...)
  299. if err == nil {
  300. break
  301. }
  302. }
  303. cancel()
  304. if err != nil {
  305. t.Errorf("unexpected error: %v", err)
  306. }
  307. }()
  308. <-readyc
  309. clus.Members[target].Restart(t)
  310. select {
  311. case <-time.After(clientTimeout + integration.RequestWaitTimeout):
  312. t.Fatalf("timed out waiting for Get [linearizable: %v, opt: %+v]", linearizable, opt)
  313. case <-donec:
  314. }
  315. }
  316. // e.g. due to clock drifts in server-side,
  317. // client context times out first in server-side
  318. // while original client-side context is not timed out yet
  319. func isServerCtxTimeout(err error) bool {
  320. if err == nil {
  321. return false
  322. }
  323. ev, ok := status.FromError(err)
  324. if !ok {
  325. return false
  326. }
  327. code := ev.Code()
  328. return code == codes.DeadlineExceeded && strings.Contains(err.Error(), "context deadline exceeded")
  329. }
  330. // In grpc v1.11.3+ dial timeouts can error out with transport.ErrConnClosing. Previously dial timeouts
  331. // would always error out with context.DeadlineExceeded.
  332. func isClientTimeout(err error) bool {
  333. if err == nil {
  334. return false
  335. }
  336. if err == context.DeadlineExceeded {
  337. return true
  338. }
  339. ev, ok := status.FromError(err)
  340. if !ok {
  341. return false
  342. }
  343. code := ev.Code()
  344. return code == codes.DeadlineExceeded
  345. }
  346. func isCanceled(err error) bool {
  347. if err == nil {
  348. return false
  349. }
  350. if err == context.Canceled {
  351. return true
  352. }
  353. ev, ok := status.FromError(err)
  354. if !ok {
  355. return false
  356. }
  357. code := ev.Code()
  358. return code == codes.Canceled
  359. }
  360. func isUnavailable(err error) bool {
  361. if err == nil {
  362. return false
  363. }
  364. if err == context.Canceled {
  365. return true
  366. }
  367. ev, ok := status.FromError(err)
  368. if !ok {
  369. return false
  370. }
  371. code := ev.Code()
  372. return code == codes.Unavailable
  373. }