// etcd.go
  1. package etcd
  2. import (
  3. "crypto/tls"
  4. "encoding/json"
  5. "fmt"
  6. "log"
  7. "net/http"
  8. "net/url"
  9. "path"
  10. "time"
  11. "github.com/coreos/etcd/config"
  12. etcdErr "github.com/coreos/etcd/error"
  13. "github.com/coreos/etcd/raft"
  14. "github.com/coreos/etcd/store"
  15. )
  16. const (
  17. defaultHeartbeat = 1
  18. defaultElection = 5
  19. maxBufferedProposal = 128
  20. defaultTickDuration = time.Millisecond * 100
  21. v2machineKVPrefix = "/_etcd/machines"
  22. v2configKVPrefix = "/_etcd/config"
  23. v2Prefix = "/v2/keys"
  24. v2machinePrefix = "/v2/machines"
  25. v2peersPrefix = "/v2/peers"
  26. v2LeaderPrefix = "/v2/leader"
  27. v2StoreStatsPrefix = "/v2/stats/store"
  28. v2adminConfigPrefix = "/v2/admin/config"
  29. v2adminMachinesPrefix = "/v2/admin/machines/"
  30. raftPrefix = "/raft"
  31. )
  32. const (
  33. participant = iota
  34. standby
  35. stop
  36. )
  37. var (
  38. tmpErr = fmt.Errorf("try again")
  39. raftStopErr = fmt.Errorf("raft is stopped")
  40. noneId int64 = -1
  41. )
  42. type Server struct {
  43. config *config.Config
  44. mode int
  45. id int64
  46. pubAddr string
  47. raftPubAddr string
  48. tickDuration time.Duration
  49. nodes map[string]bool
  50. client *v2client
  51. t *transporter
  52. // participant mode vars
  53. proposal chan v2Proposal
  54. node *v2Raft
  55. addNodeC chan raft.Config
  56. removeNodeC chan raft.Config
  57. // standby mode vars
  58. leader int64
  59. leaderAddr string
  60. clusterConf *config.ClusterConfig
  61. store.Store
  62. modeC chan int
  63. stop chan struct{}
  64. participantHandler http.Handler
  65. standbyHandler http.Handler
  66. }
  67. func New(c *config.Config, id int64) *Server {
  68. if err := c.Sanitize(); err != nil {
  69. log.Fatalf("failed sanitizing configuration: %v", err)
  70. }
  71. tc := &tls.Config{
  72. InsecureSkipVerify: true,
  73. }
  74. var err error
  75. if c.PeerTLSInfo().Scheme() == "https" {
  76. tc, err = c.PeerTLSInfo().ClientConfig()
  77. if err != nil {
  78. log.Fatal("failed to create raft transporter tls:", err)
  79. }
  80. }
  81. s := &Server{
  82. config: c,
  83. id: id,
  84. pubAddr: c.Addr,
  85. raftPubAddr: c.Peer.Addr,
  86. nodes: make(map[string]bool),
  87. client: newClient(tc),
  88. t: newTransporter(tc),
  89. tickDuration: defaultTickDuration,
  90. Store: store.New(),
  91. modeC: make(chan int, 10),
  92. stop: make(chan struct{}),
  93. }
  94. node := &v2Raft{
  95. Node: raft.New(id, defaultHeartbeat, defaultElection),
  96. result: make(map[wait]chan interface{}),
  97. }
  98. s.initParticipant(node)
  99. for _, seed := range c.Peers {
  100. s.nodes[seed] = true
  101. }
  102. m := http.NewServeMux()
  103. m.Handle(v2Prefix+"/", handlerErr(s.serveValue))
  104. m.Handle(v2machinePrefix, handlerErr(s.serveMachines))
  105. m.Handle(v2peersPrefix, handlerErr(s.serveMachines))
  106. m.Handle(v2LeaderPrefix, handlerErr(s.serveLeader))
  107. m.Handle(v2StoreStatsPrefix, handlerErr(s.serveStoreStats))
  108. m.Handle(v2adminConfigPrefix, handlerErr(s.serveAdminConfig))
  109. m.Handle(v2adminMachinesPrefix, handlerErr(s.serveAdminMachines))
  110. s.participantHandler = m
  111. m = http.NewServeMux()
  112. m.Handle("/", handlerErr(s.serveRedirect))
  113. s.standbyHandler = m
  114. return s
  115. }
  116. func (s *Server) SetTick(d time.Duration) {
  117. s.tickDuration = d
  118. }
  119. func (s *Server) RaftHandler() http.Handler {
  120. return s.t
  121. }
  122. func (s *Server) ClusterConfig() *config.ClusterConfig {
  123. c := config.NewClusterConfig()
  124. // This is used for backward compatibility because it doesn't
  125. // set cluster config in older version.
  126. if e, err := s.Get(v2configKVPrefix, false, false); err == nil {
  127. json.Unmarshal([]byte(*e.Node.Value), c)
  128. }
  129. return c
  130. }
  131. func (s *Server) Run() {
  132. if len(s.config.Peers) == 0 {
  133. s.Bootstrap()
  134. } else {
  135. s.Join()
  136. }
  137. }
  138. func (s *Server) Stop() {
  139. if s.mode == stop {
  140. return
  141. }
  142. s.mode = stop
  143. s.t.closeConnections()
  144. close(s.stop)
  145. }
  146. func (s *Server) Bootstrap() {
  147. log.Println("starting a bootstrap node")
  148. s.node.Campaign()
  149. s.node.Add(s.id, s.raftPubAddr, []byte(s.pubAddr))
  150. s.apply(s.node.Next())
  151. s.run()
  152. }
  153. func (s *Server) Join() {
  154. log.Println("joining cluster via peers", s.config.Peers)
  155. info := &context{
  156. MinVersion: store.MinVersion(),
  157. MaxVersion: store.MaxVersion(),
  158. ClientURL: s.pubAddr,
  159. PeerURL: s.raftPubAddr,
  160. }
  161. url := ""
  162. for i := 0; i < 5; i++ {
  163. for seed := range s.nodes {
  164. if err := s.client.AddMachine(seed, fmt.Sprint(s.id), info); err == nil {
  165. url = seed
  166. break
  167. } else {
  168. log.Println(err)
  169. }
  170. }
  171. if url != "" {
  172. break
  173. }
  174. time.Sleep(100 * time.Millisecond)
  175. }
  176. s.nodes = map[string]bool{url: true}
  177. s.run()
  178. }
  179. func (s *Server) Add(id int64, raftPubAddr string, pubAddr string) error {
  180. p := path.Join(v2machineKVPrefix, fmt.Sprint(id))
  181. _, err := s.Get(p, false, false)
  182. if err == nil {
  183. return nil
  184. }
  185. if v, ok := err.(*etcdErr.Error); !ok || v.ErrorCode != etcdErr.EcodeKeyNotFound {
  186. return err
  187. }
  188. w, err := s.Watch(p, true, false, 0)
  189. if err != nil {
  190. log.Println("add error:", err)
  191. return tmpErr
  192. }
  193. if s.mode != participant {
  194. return raftStopErr
  195. }
  196. select {
  197. case s.addNodeC <- raft.Config{NodeId: id, Addr: raftPubAddr, Context: []byte(pubAddr)}:
  198. default:
  199. w.Remove()
  200. log.Println("unable to send out addNode proposal")
  201. return tmpErr
  202. }
  203. select {
  204. case v := <-w.EventChan:
  205. if v.Action == store.Set {
  206. return nil
  207. }
  208. log.Println("add error: action =", v.Action)
  209. return tmpErr
  210. case <-time.After(6 * defaultHeartbeat * s.tickDuration):
  211. w.Remove()
  212. log.Println("add error: wait timeout")
  213. return tmpErr
  214. }
  215. }
  216. func (s *Server) Remove(id int64) error {
  217. p := path.Join(v2machineKVPrefix, fmt.Sprint(id))
  218. v, err := s.Get(p, false, false)
  219. if err != nil {
  220. return nil
  221. }
  222. if s.mode != participant {
  223. return raftStopErr
  224. }
  225. select {
  226. case s.removeNodeC <- raft.Config{NodeId: id}:
  227. default:
  228. log.Println("unable to send out removeNode proposal")
  229. return tmpErr
  230. }
  231. // TODO(xiangli): do not need to watch if the
  232. // removal target is self
  233. w, err := s.Watch(p, true, false, v.Index()+1)
  234. if err != nil {
  235. log.Println("remove error:", err)
  236. return tmpErr
  237. }
  238. select {
  239. case v := <-w.EventChan:
  240. if v.Action == store.Delete {
  241. return nil
  242. }
  243. log.Println("remove error: action =", v.Action)
  244. return tmpErr
  245. case <-time.After(6 * defaultHeartbeat * s.tickDuration):
  246. w.Remove()
  247. log.Println("remove error: wait timeout")
  248. return tmpErr
  249. }
  250. }
  251. func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  252. switch s.mode {
  253. case participant:
  254. s.participantHandler.ServeHTTP(w, r)
  255. case standby:
  256. s.standbyHandler.ServeHTTP(w, r)
  257. case stop:
  258. http.Error(w, "server is stopped", http.StatusInternalServerError)
  259. }
  260. }
  261. func (s *Server) initParticipant(node *v2Raft) {
  262. s.proposal = make(chan v2Proposal, maxBufferedProposal)
  263. s.node = node
  264. s.addNodeC = make(chan raft.Config, 1)
  265. s.removeNodeC = make(chan raft.Config, 1)
  266. s.t.start()
  267. }
  268. func (s *Server) initStandby(leader int64, leaderAddr string, conf *config.ClusterConfig) {
  269. s.leader = leader
  270. s.leaderAddr = leaderAddr
  271. s.clusterConf = conf
  272. }
  273. func (s *Server) run() {
  274. for {
  275. select {
  276. case s.modeC <- s.mode:
  277. default:
  278. }
  279. switch s.mode {
  280. case participant:
  281. s.runParticipant()
  282. case standby:
  283. s.runStandby()
  284. case stop:
  285. return
  286. default:
  287. panic("unsupport mode")
  288. }
  289. }
  290. }
  291. func (s *Server) runParticipant() {
  292. defer func() {
  293. s.node.StopProposalWaiters()
  294. s.t.stop()
  295. }()
  296. node := s.node
  297. recv := s.t.recv
  298. ticker := time.NewTicker(s.tickDuration)
  299. v2SyncTicker := time.NewTicker(time.Millisecond * 500)
  300. var proposal chan v2Proposal
  301. var addNodeC, removeNodeC chan raft.Config
  302. for {
  303. if node.HasLeader() {
  304. proposal = s.proposal
  305. addNodeC = s.addNodeC
  306. removeNodeC = s.removeNodeC
  307. } else {
  308. proposal = nil
  309. addNodeC = nil
  310. removeNodeC = nil
  311. }
  312. select {
  313. case p := <-proposal:
  314. node.Propose(p)
  315. case c := <-addNodeC:
  316. node.UpdateConf(raft.AddNode, &c)
  317. case c := <-removeNodeC:
  318. node.UpdateConf(raft.RemoveNode, &c)
  319. case msg := <-recv:
  320. node.Step(*msg)
  321. case <-ticker.C:
  322. node.Tick()
  323. case <-v2SyncTicker.C:
  324. node.Sync()
  325. case <-s.stop:
  326. log.Printf("Node: %d stopped\n", s.id)
  327. return
  328. }
  329. s.apply(node.Next())
  330. s.send(node.Msgs())
  331. if node.IsRemoved() {
  332. break
  333. }
  334. }
  335. log.Printf("Node: %d removed to standby mode\n", s.id)
  336. leader := noneId
  337. leaderAddr := ""
  338. if s.node.HasLeader() && !s.node.IsLeader() {
  339. leader = s.node.Leader()
  340. leaderAddr = s.fetchAddrFromStore(s.leader)
  341. }
  342. conf := s.ClusterConfig()
  343. s.initStandby(leader, leaderAddr, conf)
  344. s.mode = standby
  345. return
  346. }
  347. func (s *Server) runStandby() {
  348. syncDuration := time.Duration(int64(s.clusterConf.SyncInterval * float64(time.Second)))
  349. for {
  350. select {
  351. case <-time.After(syncDuration):
  352. case <-s.stop:
  353. log.Printf("Node: %d stopped\n", s.id)
  354. return
  355. }
  356. if err := s.syncCluster(); err != nil {
  357. continue
  358. }
  359. if err := s.standbyJoin(s.leaderAddr); err != nil {
  360. continue
  361. }
  362. break
  363. }
  364. log.Printf("Node: %d removed to participant mode\n", s.id)
  365. // TODO(yichengq): use old v2Raft
  366. // 1. reject proposal in leader state when sm is removed
  367. // 2. record removeIndex in node to ignore msgDenial and old removal
  368. s.Store = store.New()
  369. node := &v2Raft{
  370. Node: raft.New(s.id, defaultHeartbeat, defaultElection),
  371. result: make(map[wait]chan interface{}),
  372. }
  373. s.initParticipant(node)
  374. s.mode = participant
  375. return
  376. }
  377. func (s *Server) apply(ents []raft.Entry) {
  378. offset := s.node.Applied() - int64(len(ents)) + 1
  379. for i, ent := range ents {
  380. switch ent.Type {
  381. // expose raft entry type
  382. case raft.Normal:
  383. if len(ent.Data) == 0 {
  384. continue
  385. }
  386. s.v2apply(offset+int64(i), ent)
  387. case raft.AddNode:
  388. cfg := new(raft.Config)
  389. if err := json.Unmarshal(ent.Data, cfg); err != nil {
  390. log.Println(err)
  391. break
  392. }
  393. if err := s.t.set(cfg.NodeId, cfg.Addr); err != nil {
  394. log.Println(err)
  395. break
  396. }
  397. log.Printf("Add Node %x %v %v\n", cfg.NodeId, cfg.Addr, string(cfg.Context))
  398. p := path.Join(v2machineKVPrefix, fmt.Sprint(cfg.NodeId))
  399. if _, err := s.Store.Set(p, false, fmt.Sprintf("raft=%v&etcd=%v", cfg.Addr, string(cfg.Context)), store.Permanent); err == nil {
  400. s.nodes[cfg.Addr] = true
  401. }
  402. case raft.RemoveNode:
  403. cfg := new(raft.Config)
  404. if err := json.Unmarshal(ent.Data, cfg); err != nil {
  405. log.Println(err)
  406. break
  407. }
  408. log.Printf("Remove Node %x\n", cfg.NodeId)
  409. delete(s.nodes, s.fetchAddrFromStore(cfg.NodeId))
  410. p := path.Join(v2machineKVPrefix, fmt.Sprint(cfg.NodeId))
  411. s.Store.Delete(p, false, false)
  412. default:
  413. panic("unimplemented")
  414. }
  415. }
  416. }
  417. func (s *Server) send(msgs []raft.Message) {
  418. for i := range msgs {
  419. data, err := json.Marshal(msgs[i])
  420. if err != nil {
  421. // todo(xiangli): error handling
  422. log.Fatal(err)
  423. }
  424. // todo(xiangli): reuse routines and limit the number of sending routines
  425. // sync.Pool?
  426. go func(i int) {
  427. var err error
  428. if err = s.t.sendTo(msgs[i].To, data); err == nil {
  429. return
  430. }
  431. if err == errUnknownNode {
  432. err = s.fetchAddr(msgs[i].To)
  433. }
  434. if err == nil {
  435. err = s.t.sendTo(msgs[i].To, data)
  436. }
  437. if err != nil {
  438. log.Println(err)
  439. }
  440. }(i)
  441. }
  442. }
  443. func (s *Server) setClusterConfig(c *config.ClusterConfig) error {
  444. b, err := json.Marshal(c)
  445. if err != nil {
  446. return err
  447. }
  448. if _, err := s.Set(v2configKVPrefix, false, string(b), store.Permanent); err != nil {
  449. return err
  450. }
  451. return nil
  452. }
  453. func (s *Server) fetchAddr(nodeId int64) error {
  454. for seed := range s.nodes {
  455. if err := s.t.fetchAddr(seed, nodeId); err == nil {
  456. return nil
  457. }
  458. }
  459. return fmt.Errorf("cannot fetch the address of node %d", nodeId)
  460. }
  461. func (s *Server) fetchAddrFromStore(nodeId int64) string {
  462. p := path.Join(v2machineKVPrefix, fmt.Sprint(nodeId))
  463. if ev, err := s.Get(p, false, false); err == nil {
  464. if m, err := url.ParseQuery(*ev.Node.Value); err == nil {
  465. return m["raft"][0]
  466. }
  467. }
  468. return ""
  469. }
  470. func (s *Server) standbyJoin(addr string) error {
  471. if s.clusterConf.ActiveSize <= len(s.nodes) {
  472. return fmt.Errorf("full cluster")
  473. }
  474. info := &context{
  475. MinVersion: store.MinVersion(),
  476. MaxVersion: store.MaxVersion(),
  477. ClientURL: s.pubAddr,
  478. PeerURL: s.raftPubAddr,
  479. }
  480. if err := s.client.AddMachine(s.leaderAddr, fmt.Sprint(s.id), info); err != nil {
  481. return err
  482. }
  483. return nil
  484. }