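// Package etcd implements a standalone etcd server that participates in a
// raft-replicated cluster and serves the v2 key-value API over HTTP. A
// server runs in one of three modes: participant (a full raft member),
// standby (watching the cluster and waiting for a seat), or stop.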
package etcd

import (
	"crypto/tls"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"path"
	"time"

	"github.com/coreos/etcd/config"
	etcdErr "github.com/coreos/etcd/error"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/store"
)

const (
	defaultHeartbeat = 1
	defaultElection  = 5

	maxBufferedProposal = 128

	defaultTickDuration = time.Millisecond * 100

	v2machineKVPrefix = "/_etcd/machines"
	v2configKVPrefix  = "/_etcd/config"

	v2Prefix              = "/v2/keys"
	v2machinePrefix       = "/v2/machines"
	v2peersPrefix         = "/v2/peers"
	v2LeaderPrefix        = "/v2/leader"
	v2StoreStatsPrefix    = "/v2/stats/store"
	v2adminConfigPrefix   = "/v2/admin/config"
	v2adminMachinesPrefix = "/v2/admin/machines/"

	raftPrefix = "/raft"
)

const (
	participant = iota
	standby
	stop
)

var (
	tmpErr      = fmt.Errorf("try again")
	raftStopErr = fmt.Errorf("raft is stopped")

	noneId int64 = -1
)
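
// Server is a single etcd node. It embeds a store.Store for the key space
// and switches between participant and standby behavior as the cluster
// membership changes.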
type Server struct {
	config *config.Config
	mode   int

	id           int64
	pubAddr      string
	raftPubAddr  string
	nodes        map[string]bool
	peerHub      *peerHub
	tickDuration time.Duration

	client *v2client
	rh     *raftHandler
	node   *v2Raft
	store.Store

	// participant mode vars
	proposal    chan v2Proposal
	addNodeC    chan raft.Config
	removeNodeC chan raft.Config

	// standby mode vars
	leader      int64
	leaderAddr  string
	clusterConf *config.ClusterConfig

	modeC chan int
	stop  chan struct{}

	participantHandler http.Handler
	standbyHandler     http.Handler
}
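
// New creates an unstarted Server from a sanitized configuration and a node
// id. It wires up the raft node, the peer hub, the store, and the HTTP
// handlers for participant and standby modes, but does not start serving;
// call Run to enter the main loop.
//
// A minimal usage sketch (the listener wiring is illustrative, not part of
// this file; cfg, id, clientAddr, and peerAddr are placeholders):
//
//	s := New(cfg, id)
//	go s.Run()
//	go http.ListenAndServe(peerAddr, s.RaftHandler()) // raft traffic
//	http.ListenAndServe(clientAddr, s)                // client API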
func New(c *config.Config, id int64) *Server {
	if err := c.Sanitize(); err != nil {
		log.Fatalf("failed sanitizing configuration: %v", err)
	}

	tc := &tls.Config{
		InsecureSkipVerify: true,
	}
	var err error
	if c.PeerTLSInfo().Scheme() == "https" {
		tc, err = c.PeerTLSInfo().ClientConfig()
		if err != nil {
			log.Fatal("failed to create raft transporter tls:", err)
		}
	}

	tr := new(http.Transport)
	tr.TLSClientConfig = tc
	client := &http.Client{Transport: tr}

	s := &Server{
		config:       c,
		id:           id,
		pubAddr:      c.Addr,
		raftPubAddr:  c.Peer.Addr,
		nodes:        make(map[string]bool),
		peerHub:      newPeerHub(client),
		tickDuration: defaultTickDuration,
		node: &v2Raft{
			Node:   raft.New(id, defaultHeartbeat, defaultElection),
			result: make(map[wait]chan interface{}),
		},
		addNodeC:    make(chan raft.Config),
		removeNodeC: make(chan raft.Config),
		client:      newClient(tc),
		Store:       store.New(),
		modeC:       make(chan int, 10),
		stop:        make(chan struct{}),
	}
	s.rh = newRaftHandler(s.peerHub)

	for _, seed := range c.Peers {
		s.nodes[seed] = true
	}

	m := http.NewServeMux()
	m.Handle(v2Prefix+"/", handlerErr(s.serveValue))
	m.Handle(v2machinePrefix, handlerErr(s.serveMachines))
	m.Handle(v2peersPrefix, handlerErr(s.serveMachines))
	m.Handle(v2LeaderPrefix, handlerErr(s.serveLeader))
	m.Handle(v2StoreStatsPrefix, handlerErr(s.serveStoreStats))
	m.Handle(v2adminConfigPrefix, handlerErr(s.serveAdminConfig))
	m.Handle(v2adminMachinesPrefix, handlerErr(s.serveAdminMachines))
	s.participantHandler = m

	m = http.NewServeMux()
	m.Handle("/", handlerErr(s.serveRedirect))
	s.standbyHandler = m

	return s
}
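
// SetTick sets the raft tick interval; it is intended to be called before
// Run starts the main loop (typically by tests).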
func (s *Server) SetTick(d time.Duration) {
	s.tickDuration = d
}
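
// RaftHandler returns the http.Handler that accepts raft messages from peers.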
func (s *Server) RaftHandler() http.Handler {
	return s.rh
}
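
// ClusterConfig returns the cluster-wide configuration stored under
// v2configKVPrefix, falling back to the defaults when none has been set.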
func (s *Server) ClusterConfig() *config.ClusterConfig {
	c := config.NewClusterConfig()
	// Fall back to the default config for backward compatibility:
	// older versions do not store the cluster config in the key space.
	if e, err := s.Get(v2configKVPrefix, false, false); err == nil {
		json.Unmarshal([]byte(*e.Node.Value), c)
	}
	return c
}
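
// Run starts the server: it bootstraps a fresh one-node cluster when no
// peers are configured, and otherwise joins an existing cluster through the
// configured peers.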
func (s *Server) Run() {
	if len(s.config.Peers) == 0 {
		s.Bootstrap()
	} else {
		s.Join()
	}
}
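
// Stop shuts the server down and releases its network resources. It is a
// no-op if the server is already stopped.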
func (s *Server) Stop() {
	if s.mode == stop {
		return
	}
	s.mode = stop
	s.rh.stop()
	s.client.CloseConnections()
	s.peerHub.stop()
	close(s.stop)
}
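
// Bootstrap starts a brand-new cluster with this node as its first member:
// it campaigns for leadership, adds itself, applies the resulting entries,
// and enters the main loop.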
func (s *Server) Bootstrap() {
	log.Println("starting a bootstrap node")
	s.node.Campaign()
	s.node.Add(s.id, s.raftPubAddr, []byte(s.pubAddr))
	s.apply(s.node.Next())
	s.run()
}
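
// Join contacts the configured peers until one of them accepts this node as
// a new machine, retrying up to five times, then enters the main loop.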
func (s *Server) Join() {
	log.Println("joining cluster via peers", s.config.Peers)
	info := &context{
		MinVersion: store.MinVersion(),
		MaxVersion: store.MaxVersion(),
		ClientURL:  s.pubAddr,
		PeerURL:    s.raftPubAddr,
	}

	url := ""
	for i := 0; i < 5; i++ {
		for seed := range s.nodes {
			if err := s.client.AddMachine(seed, fmt.Sprint(s.id), info); err == nil {
				url = seed
				break
			} else {
				log.Println(err)
			}
		}
		if url != "" {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}

	s.nodes = map[string]bool{url: true}
	s.run()
}
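
// Add proposes the addition of node id to the cluster and blocks until the
// matching machine key is set in the store, a timeout elapses, or the
// proposal cannot be submitted; transient failures return tmpErr so the
// caller may retry.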
func (s *Server) Add(id int64, raftPubAddr string, pubAddr string) error {
	p := path.Join(v2machineKVPrefix, fmt.Sprint(id))

	_, err := s.Get(p, false, false)
	if err == nil {
		return nil
	}
	if v, ok := err.(*etcdErr.Error); !ok || v.ErrorCode != etcdErr.EcodeKeyNotFound {
		return err
	}

	w, err := s.Watch(p, true, false, 0)
	if err != nil {
		log.Println("add error:", err)
		return tmpErr
	}

	if s.mode != participant {
		return raftStopErr
	}
	select {
	case s.addNodeC <- raft.Config{NodeId: id, Addr: raftPubAddr, Context: []byte(pubAddr)}:
	default:
		w.Remove()
		log.Println("unable to send out addNode proposal")
		return tmpErr
	}

	select {
	case v := <-w.EventChan:
		if v.Action == store.Set {
			return nil
		}
		log.Println("add error: action =", v.Action)
		return tmpErr
	case <-time.After(6 * defaultHeartbeat * s.tickDuration):
		w.Remove()
		log.Println("add error: wait timeout")
		return tmpErr
	}
}
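
// Remove proposes the removal of node id from the cluster and blocks until
// the matching machine key is deleted, a timeout elapses, or the proposal
// cannot be submitted; transient failures return tmpErr so the caller may
// retry.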
func (s *Server) Remove(id int64) error {
	p := path.Join(v2machineKVPrefix, fmt.Sprint(id))
	v, err := s.Get(p, false, false)
	if err != nil {
		return nil
	}

	if s.mode != participant {
		return raftStopErr
	}
	select {
	case s.removeNodeC <- raft.Config{NodeId: id}:
	default:
		log.Println("unable to send out removeNode proposal")
		return tmpErr
	}

	// TODO(xiangli): do not need to watch if the
	// removal target is self
	w, err := s.Watch(p, true, false, v.Index()+1)
	if err != nil {
		log.Println("remove error:", err)
		return tmpErr
	}

	select {
	case v := <-w.EventChan:
		if v.Action == store.Delete {
			return nil
		}
		log.Println("remove error: action =", v.Action)
		return tmpErr
	case <-time.After(6 * defaultHeartbeat * s.tickDuration):
		w.Remove()
		log.Println("remove error: wait timeout")
		return tmpErr
	}
}
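
// ServeHTTP dispatches client requests to the handler that matches the
// server's current mode.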
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	switch s.mode {
	case participant:
		s.participantHandler.ServeHTTP(w, r)
	case standby:
		s.standbyHandler.ServeHTTP(w, r)
	case stop:
		http.Error(w, "server is stopped", http.StatusInternalServerError)
	}
}
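
// initParticipant prepares the channels used in participant mode and starts
// the raft transport handler.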
func (s *Server) initParticipant() {
	s.proposal = make(chan v2Proposal, maxBufferedProposal)
	s.addNodeC = make(chan raft.Config, 1)
	s.removeNodeC = make(chan raft.Config, 1)
	s.rh.start()
}
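
// initStandby resets the leader bookkeeping used in standby mode.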
func (s *Server) initStandby() {
	s.leader = noneId
	s.leaderAddr = ""
	s.clusterConf = config.NewClusterConfig()
}
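
// run is the top-level state machine: it publishes the current mode on
// modeC (best effort) and loops between participant and standby mode until
// the server is stopped.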
func (s *Server) run() {
	for {
		select {
		case s.modeC <- s.mode:
		default:
		}

		switch s.mode {
		case participant:
			s.initParticipant()
			s.runParticipant()
		case standby:
			s.initStandby()
			s.runStandby()
		case stop:
			return
		default:
			panic("unsupported mode")
		}
	}
}
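
// runParticipant drives the raft node: it feeds proposals, configuration
// changes, peer messages, and ticks into the node, then applies committed
// entries and sends outgoing messages. The proposal and configuration
// channels are set to nil while the cluster has no leader, which disables
// those select cases. It returns when the server stops or this node is
// removed from the cluster.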
func (s *Server) runParticipant() {
	defer func() {
		s.node.StopProposalWaiters()
		s.rh.stop()
	}()

	node := s.node
	recv := s.rh.recv
	ticker := time.NewTicker(s.tickDuration)
	v2SyncTicker := time.NewTicker(time.Millisecond * 500)

	var proposal chan v2Proposal
	var addNodeC, removeNodeC chan raft.Config
	for {
		if node.HasLeader() {
			proposal = s.proposal
			addNodeC = s.addNodeC
			removeNodeC = s.removeNodeC
		} else {
			proposal = nil
			addNodeC = nil
			removeNodeC = nil
		}
		select {
		case p := <-proposal:
			node.Propose(p)
		case c := <-addNodeC:
			node.UpdateConf(raft.AddNode, &c)
		case c := <-removeNodeC:
			node.UpdateConf(raft.RemoveNode, &c)
		case msg := <-recv:
			node.Step(*msg)
		case <-ticker.C:
			node.Tick()
		case <-v2SyncTicker.C:
			node.Sync()
		case <-s.stop:
			log.Printf("Node: %d stopped\n", s.id)
			return
		}
		s.apply(node.Next())
		s.send(node.Msgs())
		if node.IsRemoved() {
			break
		}
	}

	log.Printf("Node: %d removed from cluster, falling back to standby mode\n", s.id)
	s.mode = standby
}
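
// runStandby periodically syncs the cluster state and, once the cluster has
// room for another participant, rejoins through the leader and switches
// back to participant mode with a fresh raft node and store.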
func (s *Server) runStandby() {
	syncDuration := time.Duration(int64(s.clusterConf.SyncInterval * float64(time.Second)))
	if err := s.syncCluster(); err != nil {
		log.Println("standby sync:", err)
	}

	for {
		select {
		case <-time.After(syncDuration):
		case <-s.stop:
			log.Printf("Node: %d stopped\n", s.id)
			return
		}

		if err := s.syncCluster(); err != nil {
			log.Println("standby sync:", err)
			continue
		}
		if s.clusterConf.ActiveSize <= len(s.nodes) {
			continue
		}
		if err := s.joinByPeer(s.leaderAddr); err != nil {
			log.Println("standby join:", err)
			continue
		}
		break
	}

	log.Printf("Node: %d promoted to participant mode\n", s.id)
	// TODO(yichengq): use old v2Raft
	// 1. reject proposal in leader state when sm is removed
	// 2. record removeIndex in node to ignore msgDenial and old removal
	s.node = &v2Raft{
		Node:   raft.New(s.id, defaultHeartbeat, defaultElection),
		result: make(map[wait]chan interface{}),
	}
	s.Store = store.New()
	s.mode = participant
}
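
// apply applies committed raft entries to the server: normal entries go
// through v2apply, while configuration entries update the peer hub and the
// machine registry in the store.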
func (s *Server) apply(ents []raft.Entry) {
	offset := s.node.Applied() - int64(len(ents)) + 1
	for i, ent := range ents {
		switch ent.Type {
		// expose raft entry type
		case raft.Normal:
			if len(ent.Data) == 0 {
				continue
			}
			s.v2apply(offset+int64(i), ent)
		case raft.AddNode:
			cfg := new(raft.Config)
			if err := json.Unmarshal(ent.Data, cfg); err != nil {
				log.Println(err)
				break
			}
			if err := s.peerHub.add(cfg.NodeId, cfg.Addr); err != nil {
				log.Println(err)
				break
			}
			peer, err := s.peerHub.peer(cfg.NodeId)
			if err != nil {
				log.Fatal("cannot get the added peer:", err)
			}
			peer.participate()
			log.Printf("Add Node %x %v %v\n", cfg.NodeId, cfg.Addr, string(cfg.Context))
			p := path.Join(v2machineKVPrefix, fmt.Sprint(cfg.NodeId))
			if _, err := s.Store.Set(p, false, fmt.Sprintf("raft=%v&etcd=%v", cfg.Addr, string(cfg.Context)), store.Permanent); err == nil {
				s.nodes[cfg.Addr] = true
			}
		case raft.RemoveNode:
			cfg := new(raft.Config)
			if err := json.Unmarshal(ent.Data, cfg); err != nil {
				log.Println(err)
				break
			}
			log.Printf("Remove Node %x\n", cfg.NodeId)
			delete(s.nodes, s.fetchAddrFromStore(cfg.NodeId))
			peer, err := s.peerHub.peer(cfg.NodeId)
			if err != nil {
				log.Fatal("cannot get the removed peer:", err)
			}
			peer.idle()
			p := path.Join(v2machineKVPrefix, fmt.Sprint(cfg.NodeId))
			s.Store.Delete(p, false, false)
		default:
			panic("unimplemented")
		}
	}
}
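
// send marshals and delivers outgoing raft messages through the peer hub,
// fetching a peer's address from the seeds once if it is still unknown.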
func (s *Server) send(msgs []raft.Message) {
	for i := range msgs {
		data, err := json.Marshal(msgs[i])
		if err != nil {
			// TODO(xiangli): error handling
			log.Fatal(err)
		}
		if err = s.peerHub.send(msgs[i].To, data); err == nil {
			continue
		}
		if err == errUnknownNode {
			err = s.fetchAddr(msgs[i].To)
		}
		if err == nil {
			err = s.peerHub.send(msgs[i].To, data)
		}
		if err != nil {
			log.Println(err)
		}
	}
}
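
// setClusterConfig marshals c and persists it under v2configKVPrefix.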
func (s *Server) setClusterConfig(c *config.ClusterConfig) error {
	b, err := json.Marshal(c)
	if err != nil {
		return err
	}
	if _, err := s.Set(v2configKVPrefix, false, string(b), store.Permanent); err != nil {
		return err
	}
	return nil
}
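
// fetchAddr asks each known seed for nodeId's address until one succeeds.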
func (s *Server) fetchAddr(nodeId int64) error {
	for seed := range s.nodes {
		if err := s.peerHub.fetch(seed, nodeId); err == nil {
			return nil
		}
	}
	return fmt.Errorf("cannot fetch the address of node %d", nodeId)
}
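
// fetchAddrFromStore reads nodeId's raft address from the machine registry,
// returning the empty string when it is missing or malformed.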
func (s *Server) fetchAddrFromStore(nodeId int64) string {
	p := path.Join(v2machineKVPrefix, fmt.Sprint(nodeId))
	if ev, err := s.Get(p, false, false); err == nil {
		if m, err := url.ParseQuery(*ev.Node.Value); err == nil {
			// Guard against a machine value without a raft entry, which
			// would otherwise panic on the index below.
			if v, ok := m["raft"]; ok && len(v) > 0 {
				return v[0]
			}
		}
	}
	return ""
}
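
// joinByPeer asks the peer at addr to add this node back to the cluster.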
func (s *Server) joinByPeer(addr string) error {
	info := &context{
		MinVersion: store.MinVersion(),
		MaxVersion: store.MaxVersion(),
		ClientURL:  s.pubAddr,
		PeerURL:    s.raftPubAddr,
	}
	// Use the addr argument; the original ignored it and always dialed
	// s.leaderAddr, which happens to be what the caller passes in.
	if err := s.client.AddMachine(addr, fmt.Sprint(s.id), info); err != nil {
		return err
	}
	return nil
}