failure_case_failpoints.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "io/ioutil"
  18. "net/http"
  19. "strings"
  20. "sync"
  21. "time"
  22. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  23. )
  24. type failpointStats struct {
  25. mu sync.Mutex
  26. // crashes counts the number of crashes for a failpoint
  27. crashes map[string]int
  28. }
  29. var fpStats failpointStats
  30. func failpointFailures(clus *Cluster) (ret []Failure, err error) {
  31. var fps []string
  32. fps, err = failpointPaths(clus.Members[0].FailpointHTTPAddr)
  33. if err != nil {
  34. return nil, err
  35. }
  36. // create failure objects for all failpoints
  37. for _, fp := range fps {
  38. if len(fp) == 0 {
  39. continue
  40. }
  41. fpFails := failuresFromFailpoint(fp, clus.Tester.FailpointCommands)
  42. // wrap in delays so failpoint has time to trigger
  43. for i, fpf := range fpFails {
  44. if strings.Contains(fp, "Snap") {
  45. // hack to trigger snapshot failpoints
  46. fpFails[i] = &failureUntilSnapshot{
  47. desc: desc(fpf.Desc()),
  48. failureCase: rpcpb.FailureCase_FAILPOINTS,
  49. Failure: fpf,
  50. }
  51. } else {
  52. fpFails[i] = &failureDelay{
  53. Failure: fpf,
  54. delayDuration: 3 * time.Second,
  55. }
  56. }
  57. }
  58. ret = append(ret, fpFails...)
  59. }
  60. fpStats.crashes = make(map[string]int)
  61. return ret, err
  62. }
  63. func failpointPaths(endpoint string) ([]string, error) {
  64. resp, err := http.Get(endpoint)
  65. if err != nil {
  66. return nil, err
  67. }
  68. defer resp.Body.Close()
  69. body, rerr := ioutil.ReadAll(resp.Body)
  70. if rerr != nil {
  71. return nil, rerr
  72. }
  73. var fps []string
  74. for _, l := range strings.Split(string(body), "\n") {
  75. fp := strings.Split(l, "=")[0]
  76. fps = append(fps, fp)
  77. }
  78. return fps, nil
  79. }
  80. // failpoints follows FreeBSD KFAIL_POINT syntax.
  81. // e.g. panic("etcd-tester"),1*sleep(1000)->panic("etcd-tester")
  82. func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure) {
  83. recov := makeRecoverFailpoint(fp)
  84. for _, fcmd := range failpointCommands {
  85. inject := makeInjectFailpoint(fp, fcmd)
  86. fs = append(fs, []Failure{
  87. &failureFollower{
  88. failureByFunc: failureByFunc{
  89. desc: desc(fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd)),
  90. failureCase: rpcpb.FailureCase_FAILPOINTS,
  91. injectMember: inject,
  92. recoverMember: recov,
  93. },
  94. last: -1,
  95. lead: -1,
  96. },
  97. &failureLeader{
  98. failureByFunc: failureByFunc{
  99. desc: desc(fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd)),
  100. failureCase: rpcpb.FailureCase_FAILPOINTS,
  101. injectMember: inject,
  102. recoverMember: recov,
  103. },
  104. last: -1,
  105. lead: -1,
  106. },
  107. &failureQuorum{
  108. desc: desc(fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd)),
  109. failureCase: rpcpb.FailureCase_FAILPOINTS,
  110. injectMember: inject,
  111. recoverMember: recov,
  112. },
  113. &failureAll{
  114. desc: desc(fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd)),
  115. failureCase: rpcpb.FailureCase_FAILPOINTS,
  116. injectMember: inject,
  117. recoverMember: recov,
  118. },
  119. }...)
  120. }
  121. return fs
  122. }
  123. func makeInjectFailpoint(fp, val string) injectMemberFunc {
  124. return func(clus *Cluster, idx int) (err error) {
  125. return putFailpoint(clus.Members[idx].FailpointHTTPAddr, fp, val)
  126. }
  127. }
  128. func makeRecoverFailpoint(fp string) recoverMemberFunc {
  129. return func(clus *Cluster, idx int) error {
  130. if err := delFailpoint(clus.Members[idx].FailpointHTTPAddr, fp); err == nil {
  131. return nil
  132. }
  133. // node not responding, likely dead from fp panic; restart
  134. fpStats.mu.Lock()
  135. fpStats.crashes[fp]++
  136. fpStats.mu.Unlock()
  137. return recoverKill(clus, idx)
  138. }
  139. }
  140. func putFailpoint(ep, fp, val string) error {
  141. req, _ := http.NewRequest(http.MethodPut, ep+"/"+fp, strings.NewReader(val))
  142. c := http.Client{}
  143. resp, err := c.Do(req)
  144. if err != nil {
  145. return err
  146. }
  147. resp.Body.Close()
  148. if resp.StatusCode/100 != 2 {
  149. return fmt.Errorf("failed to PUT %s=%s at %s (%v)", fp, val, ep, resp.Status)
  150. }
  151. return nil
  152. }
  153. func delFailpoint(ep, fp string) error {
  154. req, _ := http.NewRequest(http.MethodDelete, ep+"/"+fp, strings.NewReader(""))
  155. c := http.Client{}
  156. resp, err := c.Do(req)
  157. if err != nil {
  158. return err
  159. }
  160. resp.Body.Close()
  161. if resp.StatusCode/100 != 2 {
  162. return fmt.Errorf("failed to DELETE %s at %s (%v)", fp, ep, resp.Status)
  163. }
  164. return nil
  165. }