case_failpoints.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "io/ioutil"
  18. "net/http"
  19. "strings"
  20. "sync"
  21. "github.com/coreos/etcd/functional/rpcpb"
  22. )
  23. type failpointStats struct {
  24. mu sync.Mutex
  25. // crashes counts the number of crashes for a failpoint
  26. crashes map[string]int
  27. }
  28. var fpStats failpointStats
  29. func failpointFailures(clus *Cluster) (ret []Case, err error) {
  30. var fps []string
  31. fps, err = failpointPaths(clus.Members[0].FailpointHTTPAddr)
  32. if err != nil {
  33. return nil, err
  34. }
  35. // create failure objects for all failpoints
  36. for _, fp := range fps {
  37. if len(fp) == 0 {
  38. continue
  39. }
  40. fpFails := casesFromFailpoint(fp, clus.Tester.FailpointCommands)
  41. // wrap in delays so failpoint has time to trigger
  42. for i, fpf := range fpFails {
  43. if strings.Contains(fp, "Snap") {
  44. // hack to trigger snapshot failpoints
  45. fpFails[i] = &caseUntilSnapshot{
  46. desc: fpf.Desc(),
  47. rpcpbCase: rpcpb.Case_FAILPOINTS,
  48. Case: fpf,
  49. }
  50. } else {
  51. fpFails[i] = &caseDelay{
  52. Case: fpf,
  53. delayDuration: clus.GetCaseDelayDuration(),
  54. }
  55. }
  56. }
  57. ret = append(ret, fpFails...)
  58. }
  59. fpStats.crashes = make(map[string]int)
  60. return ret, err
  61. }
  62. func failpointPaths(endpoint string) ([]string, error) {
  63. resp, err := http.Get(endpoint)
  64. if err != nil {
  65. return nil, err
  66. }
  67. defer resp.Body.Close()
  68. body, rerr := ioutil.ReadAll(resp.Body)
  69. if rerr != nil {
  70. return nil, rerr
  71. }
  72. var fps []string
  73. for _, l := range strings.Split(string(body), "\n") {
  74. fp := strings.Split(l, "=")[0]
  75. fps = append(fps, fp)
  76. }
  77. return fps, nil
  78. }
  79. // failpoints follows FreeBSD FAIL_POINT syntax.
  80. // e.g. panic("etcd-tester"),1*sleep(1000)->panic("etcd-tester")
  81. func casesFromFailpoint(fp string, failpointCommands []string) (fs []Case) {
  82. recov := makeRecoverFailpoint(fp)
  83. for _, fcmd := range failpointCommands {
  84. inject := makeInjectFailpoint(fp, fcmd)
  85. fs = append(fs, []Case{
  86. &caseFollower{
  87. caseByFunc: caseByFunc{
  88. desc: fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd),
  89. rpcpbCase: rpcpb.Case_FAILPOINTS,
  90. injectMember: inject,
  91. recoverMember: recov,
  92. },
  93. last: -1,
  94. lead: -1,
  95. },
  96. &caseLeader{
  97. caseByFunc: caseByFunc{
  98. desc: fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd),
  99. rpcpbCase: rpcpb.Case_FAILPOINTS,
  100. injectMember: inject,
  101. recoverMember: recov,
  102. },
  103. last: -1,
  104. lead: -1,
  105. },
  106. &caseQuorum{
  107. caseByFunc: caseByFunc{
  108. desc: fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd),
  109. rpcpbCase: rpcpb.Case_FAILPOINTS,
  110. injectMember: inject,
  111. recoverMember: recov,
  112. },
  113. injected: make(map[int]struct{}),
  114. },
  115. &caseAll{
  116. desc: fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd),
  117. rpcpbCase: rpcpb.Case_FAILPOINTS,
  118. injectMember: inject,
  119. recoverMember: recov,
  120. },
  121. }...)
  122. }
  123. return fs
  124. }
  125. func makeInjectFailpoint(fp, val string) injectMemberFunc {
  126. return func(clus *Cluster, idx int) (err error) {
  127. return putFailpoint(clus.Members[idx].FailpointHTTPAddr, fp, val)
  128. }
  129. }
  130. func makeRecoverFailpoint(fp string) recoverMemberFunc {
  131. return func(clus *Cluster, idx int) error {
  132. if err := delFailpoint(clus.Members[idx].FailpointHTTPAddr, fp); err == nil {
  133. return nil
  134. }
  135. // node not responding, likely dead from fp panic; restart
  136. fpStats.mu.Lock()
  137. fpStats.crashes[fp]++
  138. fpStats.mu.Unlock()
  139. return recover_SIGTERM_ETCD(clus, idx)
  140. }
  141. }
  142. func putFailpoint(ep, fp, val string) error {
  143. req, _ := http.NewRequest(http.MethodPut, ep+"/"+fp, strings.NewReader(val))
  144. c := http.Client{}
  145. resp, err := c.Do(req)
  146. if err != nil {
  147. return err
  148. }
  149. resp.Body.Close()
  150. if resp.StatusCode/100 != 2 {
  151. return fmt.Errorf("failed to PUT %s=%s at %s (%v)", fp, val, ep, resp.Status)
  152. }
  153. return nil
  154. }
  155. func delFailpoint(ep, fp string) error {
  156. req, _ := http.NewRequest(http.MethodDelete, ep+"/"+fp, strings.NewReader(""))
  157. c := http.Client{}
  158. resp, err := c.Do(req)
  159. if err != nil {
  160. return err
  161. }
  162. resp.Body.Close()
  163. if resp.StatusCode/100 != 2 {
  164. return fmt.Errorf("failed to DELETE %s at %s (%v)", fp, ep, resp.Status)
  165. }
  166. return nil
  167. }