agent.go 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "net"
  18. "net/url"
  19. "os"
  20. "os/exec"
  21. "path/filepath"
  22. "strconv"
  23. "sync"
  24. "syscall"
  25. "time"
  26. "github.com/coreos/etcd/pkg/fileutil"
  27. "github.com/coreos/etcd/pkg/transport"
  28. "github.com/coreos/etcd/tools/functional-tester/etcd-agent/client"
  29. )
  30. const (
  31. stateUninitialized = "uninitialized"
  32. stateStarted = "started"
  33. stateStopped = "stopped"
  34. stateTerminated = "terminated"
  35. )
  36. type Agent struct {
  37. state string // the state of etcd process
  38. cmd *exec.Cmd
  39. logfile *os.File
  40. cfg AgentConfig
  41. pmu sync.Mutex
  42. advertisePortToProxy map[int]transport.Proxy
  43. }
  44. type AgentConfig struct {
  45. EtcdPath string
  46. LogDir string
  47. FailpointAddr string
  48. }
  49. func newAgent(cfg AgentConfig) (*Agent, error) {
  50. // check if the file exists
  51. _, err := os.Stat(cfg.EtcdPath)
  52. if err != nil {
  53. return nil, err
  54. }
  55. c := exec.Command(cfg.EtcdPath)
  56. err = fileutil.TouchDirAll(cfg.LogDir)
  57. if err != nil {
  58. return nil, err
  59. }
  60. var f *os.File
  61. f, err = os.Create(filepath.Join(cfg.LogDir, "etcd.log"))
  62. if err != nil {
  63. return nil, err
  64. }
  65. return &Agent{
  66. state: stateUninitialized,
  67. cmd: c,
  68. logfile: f,
  69. cfg: cfg,
  70. advertisePortToProxy: make(map[int]transport.Proxy),
  71. }, nil
  72. }
  73. // start starts a new etcd process with the given args.
  74. func (a *Agent) start(args ...string) error {
  75. args = append(args, "--data-dir", a.dataDir())
  76. a.cmd = exec.Command(a.cmd.Path, args...)
  77. a.cmd.Env = []string{"GOFAIL_HTTP=" + a.cfg.FailpointAddr}
  78. a.cmd.Stdout = a.logfile
  79. a.cmd.Stderr = a.logfile
  80. err := a.cmd.Start()
  81. if err != nil {
  82. return err
  83. }
  84. a.state = stateStarted
  85. a.pmu.Lock()
  86. defer a.pmu.Unlock()
  87. if len(a.advertisePortToProxy) == 0 {
  88. // enough time for etcd start before setting up proxy
  89. time.Sleep(time.Second)
  90. var (
  91. err error
  92. s string
  93. listenClientURL *url.URL
  94. advertiseClientURL *url.URL
  95. advertiseClientURLPort int
  96. listenPeerURL *url.URL
  97. advertisePeerURL *url.URL
  98. advertisePeerURLPort int
  99. )
  100. for i := range args {
  101. switch args[i] {
  102. case "--listen-client-urls":
  103. listenClientURL, err = url.Parse(args[i+1])
  104. if err != nil {
  105. return err
  106. }
  107. case "--advertise-client-urls":
  108. advertiseClientURL, err = url.Parse(args[i+1])
  109. if err != nil {
  110. return err
  111. }
  112. _, s, err = net.SplitHostPort(advertiseClientURL.Host)
  113. if err != nil {
  114. return err
  115. }
  116. advertiseClientURLPort, err = strconv.Atoi(s)
  117. if err != nil {
  118. return err
  119. }
  120. case "--listen-peer-urls":
  121. listenPeerURL, err = url.Parse(args[i+1])
  122. if err != nil {
  123. return err
  124. }
  125. case "--initial-advertise-peer-urls":
  126. advertisePeerURL, err = url.Parse(args[i+1])
  127. if err != nil {
  128. return err
  129. }
  130. _, s, err = net.SplitHostPort(advertisePeerURL.Host)
  131. if err != nil {
  132. return err
  133. }
  134. advertisePeerURLPort, err = strconv.Atoi(s)
  135. if err != nil {
  136. return err
  137. }
  138. }
  139. }
  140. clientProxy := transport.NewProxy(transport.ProxyConfig{
  141. From: *advertiseClientURL,
  142. To: *listenClientURL,
  143. })
  144. select {
  145. case err = <-clientProxy.Error():
  146. return err
  147. case <-time.After(time.Second):
  148. }
  149. a.advertisePortToProxy[advertiseClientURLPort] = clientProxy
  150. peerProxy := transport.NewProxy(transport.ProxyConfig{
  151. From: *advertisePeerURL,
  152. To: *listenPeerURL,
  153. })
  154. select {
  155. case err = <-peerProxy.Error():
  156. return err
  157. case <-time.After(time.Second):
  158. }
  159. a.advertisePortToProxy[advertisePeerURLPort] = peerProxy
  160. }
  161. return nil
  162. }
  163. // stop stops the existing etcd process the agent started.
  164. func (a *Agent) stopWithSig(sig os.Signal) error {
  165. if a.state != stateStarted {
  166. return nil
  167. }
  168. a.pmu.Lock()
  169. if len(a.advertisePortToProxy) > 0 {
  170. for _, p := range a.advertisePortToProxy {
  171. if err := p.Close(); err != nil {
  172. a.pmu.Unlock()
  173. return err
  174. }
  175. select {
  176. case <-p.Done():
  177. // enough time to release port
  178. time.Sleep(time.Second)
  179. case <-time.After(time.Second):
  180. }
  181. }
  182. a.advertisePortToProxy = make(map[int]transport.Proxy)
  183. }
  184. a.pmu.Unlock()
  185. err := stopWithSig(a.cmd, sig)
  186. if err != nil {
  187. return err
  188. }
  189. a.state = stateStopped
  190. return nil
  191. }
  192. func stopWithSig(cmd *exec.Cmd, sig os.Signal) error {
  193. err := cmd.Process.Signal(sig)
  194. if err != nil {
  195. return err
  196. }
  197. errc := make(chan error)
  198. go func() {
  199. _, ew := cmd.Process.Wait()
  200. errc <- ew
  201. close(errc)
  202. }()
  203. select {
  204. case <-time.After(5 * time.Second):
  205. cmd.Process.Kill()
  206. case e := <-errc:
  207. return e
  208. }
  209. err = <-errc
  210. return err
  211. }
  212. // restart restarts the stopped etcd process.
  213. func (a *Agent) restart() error {
  214. return a.start(a.cmd.Args[1:]...)
  215. }
  216. func (a *Agent) cleanup() error {
  217. // exit with stackstrace
  218. if err := a.stopWithSig(syscall.SIGQUIT); err != nil {
  219. return err
  220. }
  221. a.state = stateUninitialized
  222. a.logfile.Close()
  223. if err := archiveLogAndDataDir(a.cfg.LogDir, a.dataDir()); err != nil {
  224. return err
  225. }
  226. if err := fileutil.TouchDirAll(a.cfg.LogDir); err != nil {
  227. return err
  228. }
  229. f, err := os.Create(filepath.Join(a.cfg.LogDir, "etcd.log"))
  230. if err != nil {
  231. return err
  232. }
  233. a.logfile = f
  234. // https://www.kernel.org/doc/Documentation/sysctl/vm.txt
  235. // https://github.com/torvalds/linux/blob/master/fs/drop_caches.c
  236. cmd := exec.Command("/bin/sh", "-c", `echo "echo 1 > /proc/sys/vm/drop_caches" | sudo sh`)
  237. if err := cmd.Run(); err != nil {
  238. plog.Infof("error when cleaning page cache (%v)", err)
  239. }
  240. return nil
  241. }
  242. // terminate stops the exiting etcd process the agent started
  243. // and removes the data dir.
  244. func (a *Agent) terminate() error {
  245. err := a.stopWithSig(syscall.SIGTERM)
  246. if err != nil {
  247. return err
  248. }
  249. err = os.RemoveAll(a.dataDir())
  250. if err != nil {
  251. return err
  252. }
  253. a.state = stateTerminated
  254. return nil
  255. }
  256. func (a *Agent) dropPort(port int) error {
  257. a.pmu.Lock()
  258. defer a.pmu.Unlock()
  259. p, ok := a.advertisePortToProxy[port]
  260. if !ok {
  261. return fmt.Errorf("%d does not have proxy", port)
  262. }
  263. p.BlackholeTx()
  264. p.BlackholeRx()
  265. return nil
  266. }
  267. func (a *Agent) recoverPort(port int) error {
  268. a.pmu.Lock()
  269. defer a.pmu.Unlock()
  270. p, ok := a.advertisePortToProxy[port]
  271. if !ok {
  272. return fmt.Errorf("%d does not have proxy", port)
  273. }
  274. p.UnblackholeTx()
  275. p.UnblackholeRx()
  276. return nil
  277. }
  278. func (a *Agent) setLatency(ms, rv int) error {
  279. a.pmu.Lock()
  280. defer a.pmu.Unlock()
  281. if ms == 0 {
  282. for _, p := range a.advertisePortToProxy {
  283. p.UndelayTx()
  284. p.UndelayRx()
  285. }
  286. }
  287. for _, p := range a.advertisePortToProxy {
  288. p.DelayTx(time.Duration(ms)*time.Millisecond, time.Duration(rv)*time.Millisecond)
  289. p.DelayRx(time.Duration(ms)*time.Millisecond, time.Duration(rv)*time.Millisecond)
  290. }
  291. return nil
  292. }
  293. func (a *Agent) status() client.Status {
  294. return client.Status{State: a.state}
  295. }
  296. func (a *Agent) dataDir() string {
  297. return filepath.Join(a.cfg.LogDir, "etcd.data")
  298. }
  299. func existDir(fpath string) bool {
  300. st, err := os.Stat(fpath)
  301. if err != nil {
  302. if os.IsNotExist(err) {
  303. return false
  304. }
  305. } else {
  306. return st.IsDir()
  307. }
  308. return false
  309. }
  310. func archiveLogAndDataDir(logDir string, datadir string) error {
  311. dir := filepath.Join(logDir, "failure_archive", time.Now().Format(time.RFC3339))
  312. if existDir(dir) {
  313. dir = filepath.Join(logDir, "failure_archive", time.Now().Add(time.Second).Format(time.RFC3339))
  314. }
  315. if err := fileutil.TouchDirAll(dir); err != nil {
  316. return err
  317. }
  318. if err := os.Rename(filepath.Join(logDir, "etcd.log"), filepath.Join(dir, "etcd.log")); err != nil {
  319. if !os.IsNotExist(err) {
  320. return err
  321. }
  322. }
  323. if err := os.Rename(datadir, filepath.Join(dir, filepath.Base(datadir))); err != nil {
  324. if !os.IsNotExist(err) {
  325. return err
  326. }
  327. }
  328. return nil
  329. }