server_shutdown_test.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. // Copyright 2017 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package integration
  15. import (
  16. "bytes"
  17. "context"
  18. "strings"
  19. "testing"
  20. "time"
  21. "go.etcd.io/etcd/clientv3"
  22. "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes"
  23. "go.etcd.io/etcd/integration"
  24. "go.etcd.io/etcd/pkg/testutil"
  25. "google.golang.org/grpc/codes"
  26. "google.golang.org/grpc/status"
  27. )
  28. // TestBalancerUnderServerShutdownWatch expects that watch client
  29. // switch its endpoints when the member of the pinned endpoint fails.
  30. func TestBalancerUnderServerShutdownWatch(t *testing.T) {
  31. defer testutil.AfterTest(t)
  32. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  33. Size: 3,
  34. SkipCreatingClient: true,
  35. })
  36. defer clus.Terminate(t)
  37. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  38. lead := clus.WaitLeader(t)
  39. // pin eps[lead]
  40. watchCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[lead]}})
  41. if err != nil {
  42. t.Fatal(err)
  43. }
  44. defer watchCli.Close()
  45. // wait for eps[lead] to be pinned
  46. mustWaitPinReady(t, watchCli)
  47. // add all eps to list, so that when the original pined one fails
  48. // the client can switch to other available eps
  49. watchCli.SetEndpoints(eps...)
  50. key, val := "foo", "bar"
  51. wch := watchCli.Watch(context.Background(), key, clientv3.WithCreatedNotify())
  52. select {
  53. case <-wch:
  54. case <-time.After(integration.RequestWaitTimeout):
  55. t.Fatal("took too long to create watch")
  56. }
  57. donec := make(chan struct{})
  58. go func() {
  59. defer close(donec)
  60. // switch to others when eps[lead] is shut down
  61. select {
  62. case ev := <-wch:
  63. if werr := ev.Err(); werr != nil {
  64. t.Error(werr)
  65. }
  66. if len(ev.Events) != 1 {
  67. t.Errorf("expected one event, got %+v", ev)
  68. }
  69. if !bytes.Equal(ev.Events[0].Kv.Value, []byte(val)) {
  70. t.Errorf("expected %q, got %+v", val, ev.Events[0].Kv)
  71. }
  72. case <-time.After(7 * time.Second):
  73. t.Error("took too long to receive events")
  74. }
  75. }()
  76. // shut down eps[lead]
  77. clus.Members[lead].Terminate(t)
  78. // writes to eps[lead+1]
  79. putCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[(lead+1)%3]}})
  80. if err != nil {
  81. t.Fatal(err)
  82. }
  83. defer putCli.Close()
  84. for {
  85. ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
  86. _, err = putCli.Put(ctx, key, val)
  87. cancel()
  88. if err == nil {
  89. break
  90. }
  91. if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout || err == rpctypes.ErrTimeoutDueToLeaderFail {
  92. continue
  93. }
  94. t.Fatal(err)
  95. }
  96. select {
  97. case <-donec:
  98. case <-time.After(5 * time.Second): // enough time for balancer switch
  99. t.Fatal("took too long to receive events")
  100. }
  101. }
  102. func TestBalancerUnderServerShutdownPut(t *testing.T) {
  103. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  104. _, err := cli.Put(ctx, "foo", "bar")
  105. return err
  106. })
  107. }
  108. func TestBalancerUnderServerShutdownDelete(t *testing.T) {
  109. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  110. _, err := cli.Delete(ctx, "foo")
  111. return err
  112. })
  113. }
  114. func TestBalancerUnderServerShutdownTxn(t *testing.T) {
  115. testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  116. _, err := cli.Txn(ctx).
  117. If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
  118. Then(clientv3.OpPut("foo", "bar")).
  119. Else(clientv3.OpPut("foo", "baz")).Commit()
  120. return err
  121. })
  122. }
  123. // testBalancerUnderServerShutdownMutable expects that when the member of
  124. // the pinned endpoint is shut down, the balancer switches its endpoints
  125. // and all subsequent put/delete/txn requests succeed with new endpoints.
  126. func testBalancerUnderServerShutdownMutable(t *testing.T, op func(*clientv3.Client, context.Context) error) {
  127. defer testutil.AfterTest(t)
  128. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  129. Size: 3,
  130. SkipCreatingClient: true,
  131. })
  132. defer clus.Terminate(t)
  133. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  134. // pin eps[0]
  135. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}})
  136. if err != nil {
  137. t.Fatal(err)
  138. }
  139. defer cli.Close()
  140. // wait for eps[0] to be pinned
  141. mustWaitPinReady(t, cli)
  142. // add all eps to list, so that when the original pined one fails
  143. // the client can switch to other available eps
  144. cli.SetEndpoints(eps...)
  145. // shut down eps[0]
  146. clus.Members[0].Terminate(t)
  147. // switched to others when eps[0] was explicitly shut down
  148. // and following request should succeed
  149. // TODO: remove this (expose client connection state?)
  150. time.Sleep(time.Second)
  151. cctx, ccancel := context.WithTimeout(context.Background(), time.Second)
  152. err = op(cli, cctx)
  153. ccancel()
  154. if err != nil {
  155. t.Fatal(err)
  156. }
  157. }
  158. func TestBalancerUnderServerShutdownGetLinearizable(t *testing.T) {
  159. testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  160. _, err := cli.Get(ctx, "foo")
  161. return err
  162. }, 7*time.Second) // give enough time for leader election, balancer switch
  163. }
  164. func TestBalancerUnderServerShutdownGetSerializable(t *testing.T) {
  165. testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
  166. _, err := cli.Get(ctx, "foo", clientv3.WithSerializable())
  167. return err
  168. }, 2*time.Second)
  169. }
  170. // testBalancerUnderServerShutdownImmutable expects that when the member of
  171. // the pinned endpoint is shut down, the balancer switches its endpoints
  172. // and all subsequent range requests succeed with new endpoints.
  173. func testBalancerUnderServerShutdownImmutable(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
  174. defer testutil.AfterTest(t)
  175. clus := integration.NewClusterV3(t, &integration.ClusterConfig{
  176. Size: 3,
  177. SkipCreatingClient: true,
  178. })
  179. defer clus.Terminate(t)
  180. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}
  181. // pin eps[0]
  182. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}})
  183. if err != nil {
  184. t.Errorf("failed to create client: %v", err)
  185. }
  186. defer cli.Close()
  187. // wait for eps[0] to be pinned
  188. mustWaitPinReady(t, cli)
  189. // add all eps to list, so that when the original pined one fails
  190. // the client can switch to other available eps
  191. cli.SetEndpoints(eps...)
  192. // shut down eps[0]
  193. clus.Members[0].Terminate(t)
  194. // switched to others when eps[0] was explicitly shut down
  195. // and following request should succeed
  196. cctx, ccancel := context.WithTimeout(context.Background(), timeout)
  197. err = op(cli, cctx)
  198. ccancel()
  199. if err != nil {
  200. t.Errorf("failed to finish range request in time %v (timeout %v)", err, timeout)
  201. }
  202. }
  203. func TestBalancerUnderServerStopInflightLinearizableGetOnRestart(t *testing.T) {
  204. tt := []pinTestOpt{
  205. {pinLeader: true, stopPinFirst: true},
  206. {pinLeader: true, stopPinFirst: false},
  207. {pinLeader: false, stopPinFirst: true},
  208. {pinLeader: false, stopPinFirst: false},
  209. }
  210. for i := range tt {
  211. testBalancerUnderServerStopInflightRangeOnRestart(t, true, tt[i])
  212. }
  213. }
  214. func TestBalancerUnderServerStopInflightSerializableGetOnRestart(t *testing.T) {
  215. tt := []pinTestOpt{
  216. {pinLeader: true, stopPinFirst: true},
  217. {pinLeader: true, stopPinFirst: false},
  218. {pinLeader: false, stopPinFirst: true},
  219. {pinLeader: false, stopPinFirst: false},
  220. }
  221. for i := range tt {
  222. testBalancerUnderServerStopInflightRangeOnRestart(t, false, tt[i])
  223. }
  224. }
  225. type pinTestOpt struct {
  226. pinLeader bool
  227. stopPinFirst bool
  228. }
  229. // testBalancerUnderServerStopInflightRangeOnRestart expects
  230. // inflight range request reconnects on server restart.
  231. func testBalancerUnderServerStopInflightRangeOnRestart(t *testing.T, linearizable bool, opt pinTestOpt) {
  232. defer testutil.AfterTest(t)
  233. cfg := &integration.ClusterConfig{
  234. Size: 2,
  235. SkipCreatingClient: true,
  236. }
  237. if linearizable {
  238. cfg.Size = 3
  239. }
  240. clus := integration.NewClusterV3(t, cfg)
  241. defer clus.Terminate(t)
  242. eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr()}
  243. if linearizable {
  244. eps = append(eps, clus.Members[2].GRPCAddr())
  245. }
  246. lead := clus.WaitLeader(t)
  247. target := lead
  248. if !opt.pinLeader {
  249. target = (target + 1) % 2
  250. }
  251. // pin eps[target]
  252. cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[target]}})
  253. if err != nil {
  254. t.Errorf("failed to create client: %v", err)
  255. }
  256. defer cli.Close()
  257. // wait for eps[target] to be pinned
  258. mustWaitPinReady(t, cli)
  259. // add all eps to list, so that when the original pined one fails
  260. // the client can switch to other available eps
  261. cli.SetEndpoints(eps...)
  262. if opt.stopPinFirst {
  263. clus.Members[target].Stop(t)
  264. // give some time for balancer switch before stopping the other
  265. time.Sleep(time.Second)
  266. clus.Members[(target+1)%2].Stop(t)
  267. } else {
  268. clus.Members[(target+1)%2].Stop(t)
  269. // balancer cannot pin other member since it's already stopped
  270. clus.Members[target].Stop(t)
  271. }
  272. // 3-second is the minimum interval between endpoint being marked
  273. // as unhealthy and being removed from unhealthy, so possibly
  274. // takes >5-second to unpin and repin an endpoint
  275. // TODO: decrease timeout when balancer switch rewrite
  276. clientTimeout := 7 * time.Second
  277. var gops []clientv3.OpOption
  278. if !linearizable {
  279. gops = append(gops, clientv3.WithSerializable())
  280. }
  281. donec, readyc := make(chan struct{}), make(chan struct{}, 1)
  282. go func() {
  283. defer close(donec)
  284. ctx, cancel := context.WithTimeout(context.TODO(), clientTimeout)
  285. readyc <- struct{}{}
  286. // TODO: The new grpc load balancer will not pin to an endpoint
  287. // as intended by this test. But it will round robin member within
  288. // two attempts.
  289. // Remove retry loop once the new grpc load balancer provides retry.
  290. for i := 0; i < 2; i++ {
  291. _, err = cli.Get(ctx, "abc", gops...)
  292. if err == nil {
  293. break
  294. }
  295. }
  296. cancel()
  297. if err != nil {
  298. t.Errorf("unexpected error: %v", err)
  299. }
  300. }()
  301. <-readyc
  302. clus.Members[target].Restart(t)
  303. select {
  304. case <-time.After(clientTimeout + integration.RequestWaitTimeout):
  305. t.Fatalf("timed out waiting for Get [linearizable: %v, opt: %+v]", linearizable, opt)
  306. case <-donec:
  307. }
  308. }
  309. // e.g. due to clock drifts in server-side,
  310. // client context times out first in server-side
  311. // while original client-side context is not timed out yet
  312. func isServerCtxTimeout(err error) bool {
  313. if err == nil {
  314. return false
  315. }
  316. ev, ok := status.FromError(err)
  317. if !ok {
  318. return false
  319. }
  320. code := ev.Code()
  321. return code == codes.DeadlineExceeded && strings.Contains(err.Error(), "context deadline exceeded")
  322. }
  323. // In grpc v1.11.3+ dial timeouts can error out with transport.ErrConnClosing. Previously dial timeouts
  324. // would always error out with context.DeadlineExceeded.
  325. func isClientTimeout(err error) bool {
  326. if err == nil {
  327. return false
  328. }
  329. if err == context.DeadlineExceeded {
  330. return true
  331. }
  332. ev, ok := status.FromError(err)
  333. if !ok {
  334. return false
  335. }
  336. code := ev.Code()
  337. return code == codes.DeadlineExceeded
  338. }
  339. func isCanceled(err error) bool {
  340. if err == nil {
  341. return false
  342. }
  343. if err == context.Canceled {
  344. return true
  345. }
  346. ev, ok := status.FromError(err)
  347. if !ok {
  348. return false
  349. }
  350. code := ev.Code()
  351. return code == codes.Canceled
  352. }
  353. func isUnavailable(err error) bool {
  354. if err == nil {
  355. return false
  356. }
  357. if err == context.Canceled {
  358. return true
  359. }
  360. ev, ok := status.FromError(err)
  361. if !ok {
  362. return false
  363. }
  364. code := ev.Code()
  365. return code == codes.Unavailable
  366. }