server.go 32 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270
  1. package raft
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "hash/crc32"
  7. "io"
  8. "io/ioutil"
  9. "os"
  10. "path"
  11. "sort"
  12. "sync"
  13. "time"
  14. )
  15. //------------------------------------------------------------------------------
  16. //
  17. // Constants
  18. //
  19. //------------------------------------------------------------------------------
  20. const (
  21. Stopped = "stopped"
  22. Follower = "follower"
  23. Candidate = "candidate"
  24. Leader = "leader"
  25. Snapshotting = "snapshotting"
  26. )
  27. const (
  28. MaxLogEntriesPerRequest = 2000
  29. NumberOfLogEntriesAfterSnapshot = 200
  30. )
  31. const (
  32. DefaultHeartbeatTimeout = 50 * time.Millisecond
  33. DefaultElectionTimeout = 150 * time.Millisecond
  34. )
  35. var stopValue interface{}
  36. //------------------------------------------------------------------------------
  37. //
  38. // Errors
  39. //
  40. //------------------------------------------------------------------------------
  41. var NotLeaderError = errors.New("raft.Server: Not current leader")
  42. var DuplicatePeerError = errors.New("raft.Server: Duplicate peer")
  43. var CommandTimeoutError = errors.New("raft: Command timeout")
  44. //------------------------------------------------------------------------------
  45. //
  46. // Typedefs
  47. //
  48. //------------------------------------------------------------------------------
  49. // A server is involved in the consensus protocol and can act as a follower,
  50. // candidate or a leader.
  51. type Server struct {
  52. name string
  53. path string
  54. state string
  55. transporter Transporter
  56. context interface{}
  57. currentTerm uint64
  58. votedFor string
  59. log *Log
  60. leader string
  61. peers map[string]*Peer
  62. mutex sync.RWMutex
  63. syncedPeer map[string]bool
  64. c chan *event
  65. electionTimeout time.Duration
  66. heartbeatTimeout time.Duration
  67. currentSnapshot *Snapshot
  68. lastSnapshot *Snapshot
  69. stateMachine StateMachine
  70. maxLogEntriesPerRequest uint64
  71. confFile *os.File
  72. }
  73. // An event to be processed by the server's event loop.
  74. type event struct {
  75. target interface{}
  76. returnValue interface{}
  77. c chan error
  78. }
  79. //------------------------------------------------------------------------------
  80. //
  81. // Constructor
  82. //
  83. //------------------------------------------------------------------------------
  84. // Creates a new server with a log at the given path.
  85. func NewServer(name string, path string, transporter Transporter, stateMachine StateMachine, context interface{}) (*Server, error) {
  86. if name == "" {
  87. return nil, errors.New("raft.Server: Name cannot be blank")
  88. }
  89. if transporter == nil {
  90. panic("raft: Transporter required")
  91. }
  92. s := &Server{
  93. name: name,
  94. path: path,
  95. transporter: transporter,
  96. stateMachine: stateMachine,
  97. context: context,
  98. state: Stopped,
  99. peers: make(map[string]*Peer),
  100. log: newLog(),
  101. c: make(chan *event, 256),
  102. electionTimeout: DefaultElectionTimeout,
  103. heartbeatTimeout: DefaultHeartbeatTimeout,
  104. maxLogEntriesPerRequest: MaxLogEntriesPerRequest,
  105. }
  106. // Setup apply function.
  107. s.log.ApplyFunc = func(c Command) (interface{}, error) {
  108. result, err := c.Apply(s)
  109. return result, err
  110. }
  111. return s, nil
  112. }
  113. //------------------------------------------------------------------------------
  114. //
  115. // Accessors
  116. //
  117. //------------------------------------------------------------------------------
  118. //--------------------------------------
  119. // General
  120. //--------------------------------------
  121. // Retrieves the name of the server.
  122. func (s *Server) Name() string {
  123. return s.name
  124. }
  125. // Retrieves the storage path for the server.
  126. func (s *Server) Path() string {
  127. return s.path
  128. }
  129. // The name of the current leader.
  130. func (s *Server) Leader() string {
  131. return s.leader
  132. }
  133. // Retrieves a copy of the peer data.
  134. func (s *Server) Peers() map[string]*Peer {
  135. s.mutex.Lock()
  136. defer s.mutex.Unlock()
  137. peers := make(map[string]*Peer)
  138. for name, peer := range s.peers {
  139. peers[name] = peer.clone()
  140. }
  141. return peers
  142. }
  143. // Retrieves the object that transports requests.
  144. func (s *Server) Transporter() Transporter {
  145. s.mutex.RLock()
  146. defer s.mutex.RUnlock()
  147. return s.transporter
  148. }
  149. func (s *Server) SetTransporter(t Transporter) {
  150. s.mutex.Lock()
  151. defer s.mutex.Unlock()
  152. s.transporter = t
  153. }
  154. // Retrieves the context passed into the constructor.
  155. func (s *Server) Context() interface{} {
  156. return s.context
  157. }
  158. // Retrieves the log path for the server.
  159. func (s *Server) LogPath() string {
  160. return path.Join(s.path, "log")
  161. }
  162. // Retrieves the current state of the server.
  163. func (s *Server) State() string {
  164. s.mutex.RLock()
  165. defer s.mutex.RUnlock()
  166. return s.state
  167. }
  168. // Sets the state of the server.
  169. func (s *Server) setState(state string) {
  170. s.mutex.Lock()
  171. defer s.mutex.Unlock()
  172. s.state = state
  173. if state == Leader {
  174. s.leader = s.Name()
  175. }
  176. }
  177. // Retrieves the current term of the server.
  178. func (s *Server) Term() uint64 {
  179. return s.currentTerm
  180. }
  181. // Retrieves the current commit index of the server.
  182. func (s *Server) CommitIndex() uint64 {
  183. return s.log.commitIndex
  184. }
  185. // Retrieves the name of the candidate this server voted for in this term.
  186. func (s *Server) VotedFor() string {
  187. return s.votedFor
  188. }
  189. // Retrieves whether the server's log has no entries.
  190. func (s *Server) IsLogEmpty() bool {
  191. return s.log.isEmpty()
  192. }
  193. // A list of all the log entries. This should only be used for debugging purposes.
  194. func (s *Server) LogEntries() []*LogEntry {
  195. return s.log.entries
  196. }
  197. // A reference to the command name of the last entry.
  198. func (s *Server) LastCommandName() string {
  199. return s.log.lastCommandName()
  200. }
  201. // Get the state of the server for debugging
  202. func (s *Server) GetState() string {
  203. s.mutex.RLock()
  204. defer s.mutex.RUnlock()
  205. return fmt.Sprintf("Name: %s, State: %s, Term: %v, CommitedIndex: %v ", s.name, s.state, s.currentTerm, s.log.commitIndex)
  206. }
  207. // Check if the server is promotable
  208. func (s *Server) promotable() bool {
  209. return s.log.currentIndex() > 0
  210. }
  211. //--------------------------------------
  212. // Membership
  213. //--------------------------------------
  214. // Retrieves the number of member servers in the consensus.
  215. func (s *Server) MemberCount() int {
  216. s.mutex.Lock()
  217. defer s.mutex.Unlock()
  218. return len(s.peers) + 1
  219. }
  220. // Retrieves the number of servers required to make a quorum.
  221. func (s *Server) QuorumSize() int {
  222. return (s.MemberCount() / 2) + 1
  223. }
  224. //--------------------------------------
  225. // Election timeout
  226. //--------------------------------------
  227. // Retrieves the election timeout.
  228. func (s *Server) ElectionTimeout() time.Duration {
  229. return s.electionTimeout
  230. }
  231. // Sets the election timeout.
  232. func (s *Server) SetElectionTimeout(duration time.Duration) {
  233. s.electionTimeout = duration
  234. }
  235. //--------------------------------------
  236. // Heartbeat timeout
  237. //--------------------------------------
  238. // Retrieves the heartbeat timeout.
  239. func (s *Server) HeartbeatTimeout() time.Duration {
  240. return s.heartbeatTimeout
  241. }
  242. // Sets the heartbeat timeout.
  243. func (s *Server) SetHeartbeatTimeout(duration time.Duration) {
  244. s.mutex.Lock()
  245. defer s.mutex.Unlock()
  246. s.heartbeatTimeout = duration
  247. for _, peer := range s.peers {
  248. peer.setHeartbeatTimeout(duration)
  249. }
  250. }
  251. //------------------------------------------------------------------------------
  252. //
  253. // Methods
  254. //
  255. //------------------------------------------------------------------------------
  256. //--------------------------------------
  257. // Initialization
  258. //--------------------------------------
  259. // Reg the NOPCommand
  260. func init() {
  261. RegisterCommand(&NOPCommand{})
  262. RegisterCommand(&DefaultJoinCommand{})
  263. RegisterCommand(&DefaultLeaveCommand{})
  264. }
  265. // Start as follow
  266. // If log entries exist then allow promotion to candidate if no AEs received.
  267. // If no log entries exist then wait for AEs from another node.
  268. // If no log entries exist and a self-join command is issued then
  269. // immediately become leader and commit entry.
  270. func (s *Server) Start() error {
  271. // Exit if the server is already running.
  272. if s.state != Stopped {
  273. return errors.New("raft.Server: Server already running")
  274. }
  275. // Create snapshot directory if not exist
  276. os.Mkdir(path.Join(s.path, "snapshot"), 0700)
  277. // Initialize the log and load it up.
  278. if err := s.log.open(s.LogPath()); err != nil {
  279. s.debugln("raft: Log error: ", err)
  280. return fmt.Errorf("raft: Initialization error: %s", err)
  281. }
  282. if err := s.readConf(); err != nil {
  283. s.debugln("raft: Conf file error: ", err)
  284. return fmt.Errorf("raft: Initialization error: %s", err)
  285. }
  286. // Update the term to the last term in the log.
  287. _, s.currentTerm = s.log.lastInfo()
  288. s.setState(Follower)
  289. // If no log entries exist then
  290. // 1. wait for AEs from another node
  291. // 2. wait for self-join command
  292. // to set itself promotable
  293. if !s.promotable() {
  294. s.debugln("start as a new raft server")
  295. // If log entries exist then allow promotion to candidate
  296. // if no AEs received.
  297. } else {
  298. s.debugln("start from previous saved state")
  299. }
  300. debugln(s.GetState())
  301. go s.loop()
  302. return nil
  303. }
  304. // Read the configuration for the server.
  305. func (s *Server) readConf() error {
  306. var err error
  307. confPath := path.Join(s.path, "conf")
  308. s.debugln("readConf.open ", confPath)
  309. // open conf file
  310. s.confFile, err = os.OpenFile(confPath, os.O_RDWR, 0600)
  311. if err != nil {
  312. if os.IsNotExist(err) {
  313. s.confFile, err = os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0600)
  314. debugln("readConf.create ", confPath)
  315. if err != nil {
  316. return err
  317. }
  318. }
  319. return err
  320. }
  321. peerNames := make([]string, 0)
  322. for {
  323. var peerName string
  324. _, err = fmt.Fscanf(s.confFile, "%s\n", &peerName)
  325. if err != nil {
  326. if err == io.EOF {
  327. s.debugln("server.peer.conf: finish")
  328. break
  329. }
  330. return err
  331. }
  332. s.debugln("server.peer.conf.read: ", peerName)
  333. peerNames = append(peerNames, peerName)
  334. }
  335. s.confFile.Truncate(0)
  336. s.confFile.Seek(0, os.SEEK_SET)
  337. for _, peerName := range peerNames {
  338. s.AddPeer(peerName)
  339. }
  340. return nil
  341. }
  342. // Shuts down the server.
  343. func (s *Server) Stop() {
  344. s.send(&stopValue)
  345. s.mutex.Lock()
  346. s.log.close()
  347. s.mutex.Unlock()
  348. }
  349. // Checks if the server is currently running.
  350. func (s *Server) Running() bool {
  351. s.mutex.RLock()
  352. defer s.mutex.RUnlock()
  353. return s.state != Stopped
  354. }
  355. //--------------------------------------
  356. // Term
  357. //--------------------------------------
  358. // Sets the current term for the server. This is only used when an external
  359. // current term is found.
  360. func (s *Server) setCurrentTerm(term uint64, leaderName string, append bool) {
  361. s.mutex.Lock()
  362. defer s.mutex.Unlock()
  363. // update the term and clear vote for
  364. if term > s.currentTerm {
  365. s.state = Follower
  366. s.currentTerm = term
  367. s.leader = leaderName
  368. s.votedFor = ""
  369. return
  370. }
  371. // discover new leader when candidate
  372. // save leader name when follower
  373. if term == s.currentTerm && s.state != Leader && append {
  374. s.state = Follower
  375. s.leader = leaderName
  376. }
  377. }
  378. //--------------------------------------
  379. // Event Loop
  380. //--------------------------------------
  381. // ________
  382. // --|Snapshot| timeout
  383. // | -------- ______
  384. // recover | ^ | |
  385. // snapshot / | |snapshot | |
  386. // higher | | v | recv majority votes
  387. // term | -------- timeout ----------- -----------
  388. // |-> |Follower| ----------> | Candidate |--------------------> | Leader |
  389. // -------- ----------- -----------
  390. // ^ higher term/ | higher term |
  391. // | new leader | |
  392. // |_______________________|____________________________________ |
  393. // The main event loop for the server
  394. func (s *Server) loop() {
  395. defer s.debugln("server.loop.end")
  396. for {
  397. state := s.State()
  398. s.debugln("server.loop.run ", state)
  399. switch state {
  400. case Follower:
  401. s.followerLoop()
  402. case Candidate:
  403. s.candidateLoop()
  404. case Leader:
  405. s.leaderLoop()
  406. case Snapshotting:
  407. s.snapshotLoop()
  408. case Stopped:
  409. return
  410. }
  411. }
  412. }
  413. // Sends an event to the event loop to be processed. The function will wait
  414. // until the event is actually processed before returning.
  415. func (s *Server) send(value interface{}) (interface{}, error) {
  416. event := s.sendAsync(value)
  417. err := <-event.c
  418. return event.returnValue, err
  419. }
  420. func (s *Server) sendAsync(value interface{}) *event {
  421. event := &event{target: value, c: make(chan error, 1)}
  422. s.c <- event
  423. return event
  424. }
  425. // The event loop that is run when the server is in a Follower state.
  426. // Responds to RPCs from candidates and leaders.
  427. // Converts to candidate if election timeout elapses without either:
  428. // 1.Receiving valid AppendEntries RPC, or
  429. // 2.Granting vote to candidate
  430. func (s *Server) followerLoop() {
  431. s.setState(Follower)
  432. timeoutChan := afterBetween(s.ElectionTimeout(), s.ElectionTimeout()*2)
  433. for {
  434. var err error
  435. update := false
  436. select {
  437. case e := <-s.c:
  438. if e.target == &stopValue {
  439. s.setState(Stopped)
  440. } else if command, ok := e.target.(JoinCommand); ok {
  441. //If no log entries exist and a self-join command is issued
  442. //then immediately become leader and commit entry.
  443. if s.log.currentIndex() == 0 && command.NodeName() == s.Name() {
  444. s.debugln("selfjoin and promote to leader")
  445. s.setState(Leader)
  446. s.processCommand(command, e)
  447. } else {
  448. err = NotLeaderError
  449. }
  450. } else if req, ok := e.target.(*AppendEntriesRequest); ok {
  451. e.returnValue, update = s.processAppendEntriesRequest(req)
  452. } else if req, ok := e.target.(*RequestVoteRequest); ok {
  453. e.returnValue, update = s.processRequestVoteRequest(req)
  454. } else if req, ok := e.target.(*SnapshotRequest); ok {
  455. e.returnValue = s.processSnapshotRequest(req)
  456. } else {
  457. err = NotLeaderError
  458. }
  459. // Callback to event.
  460. e.c <- err
  461. case <-timeoutChan:
  462. // only allow synced follower to promote to candidate
  463. if s.promotable() {
  464. s.setState(Candidate)
  465. } else {
  466. update = true
  467. }
  468. }
  469. // Converts to candidate if election timeout elapses without either:
  470. // 1.Receiving valid AppendEntries RPC, or
  471. // 2.Granting vote to candidate
  472. if update {
  473. timeoutChan = afterBetween(s.ElectionTimeout(), s.ElectionTimeout()*2)
  474. }
  475. // Exit loop on state change.
  476. if s.State() != Follower {
  477. break
  478. }
  479. }
  480. }
  481. // The event loop that is run when the server is in a Candidate state.
  482. func (s *Server) candidateLoop() {
  483. lastLogIndex, lastLogTerm := s.log.lastInfo()
  484. s.leader = ""
  485. for {
  486. // Increment current term, vote for self.
  487. s.currentTerm++
  488. s.votedFor = s.name
  489. // Send RequestVote RPCs to all other servers.
  490. respChan := make(chan *RequestVoteResponse, len(s.peers))
  491. for _, peer := range s.peers {
  492. go peer.sendVoteRequest(newRequestVoteRequest(s.currentTerm, s.name, lastLogIndex, lastLogTerm), respChan)
  493. }
  494. // Wait for either:
  495. // * Votes received from majority of servers: become leader
  496. // * AppendEntries RPC received from new leader: step down.
  497. // * Election timeout elapses without election resolution: increment term, start new election
  498. // * Discover higher term: step down (§5.1)
  499. votesGranted := 1
  500. timeoutChan := afterBetween(s.ElectionTimeout(), s.ElectionTimeout()*2)
  501. timeout := false
  502. for {
  503. // If we received enough votes then stop waiting for more votes.
  504. s.debugln("server.candidate.votes: ", votesGranted, " quorum:", s.QuorumSize())
  505. if votesGranted >= s.QuorumSize() {
  506. s.setState(Leader)
  507. break
  508. }
  509. // Collect votes from peers.
  510. select {
  511. case resp := <-respChan:
  512. if resp.VoteGranted {
  513. s.debugln("server.candidate.vote.granted: ", votesGranted)
  514. votesGranted++
  515. } else if resp.Term > s.currentTerm {
  516. s.debugln("server.candidate.vote.failed")
  517. s.setCurrentTerm(resp.Term, "", false)
  518. } else {
  519. s.debugln("server.candidate.vote: denied")
  520. }
  521. case e := <-s.c:
  522. var err error
  523. if e.target == &stopValue {
  524. s.setState(Stopped)
  525. } else if _, ok := e.target.(Command); ok {
  526. err = NotLeaderError
  527. } else if req, ok := e.target.(*AppendEntriesRequest); ok {
  528. e.returnValue, _ = s.processAppendEntriesRequest(req)
  529. } else if req, ok := e.target.(*RequestVoteRequest); ok {
  530. e.returnValue, _ = s.processRequestVoteRequest(req)
  531. }
  532. // Callback to event.
  533. e.c <- err
  534. case <-timeoutChan:
  535. timeout = true
  536. }
  537. // both process AER and RVR can make the server to follower
  538. // also break when timeout happens
  539. if s.State() != Candidate || timeout {
  540. break
  541. }
  542. }
  543. // break when we are not candidate
  544. if s.State() != Candidate {
  545. break
  546. }
  547. // continue when timeout happened
  548. }
  549. }
  550. // The event loop that is run when the server is in a Candidate state.
  551. func (s *Server) leaderLoop() {
  552. s.setState(Leader)
  553. s.syncedPeer = make(map[string]bool)
  554. logIndex, _ := s.log.lastInfo()
  555. // Update the peers prevLogIndex to leader's lastLogIndex and start heartbeat.
  556. s.debugln("leaderLoop.set.PrevIndex to ", logIndex)
  557. for _, peer := range s.peers {
  558. peer.setPrevLogIndex(logIndex)
  559. peer.startHeartbeat()
  560. }
  561. go s.Do(NOPCommand{})
  562. // Begin to collect response from followers
  563. for {
  564. var err error
  565. select {
  566. case e := <-s.c:
  567. if e.target == &stopValue {
  568. s.setState(Stopped)
  569. } else if command, ok := e.target.(Command); ok {
  570. s.processCommand(command, e)
  571. continue
  572. } else if req, ok := e.target.(*AppendEntriesRequest); ok {
  573. e.returnValue, _ = s.processAppendEntriesRequest(req)
  574. } else if resp, ok := e.target.(*AppendEntriesResponse); ok {
  575. s.processAppendEntriesResponse(resp)
  576. } else if req, ok := e.target.(*RequestVoteRequest); ok {
  577. e.returnValue, _ = s.processRequestVoteRequest(req)
  578. }
  579. // Callback to event.
  580. e.c <- err
  581. }
  582. // Exit loop on state change.
  583. if s.State() != Leader {
  584. break
  585. }
  586. }
  587. // Stop all peers.
  588. for _, peer := range s.peers {
  589. peer.stopHeartbeat()
  590. }
  591. s.syncedPeer = nil
  592. }
  593. func (s *Server) snapshotLoop() {
  594. s.setState(Snapshotting)
  595. for {
  596. var err error
  597. e := <-s.c
  598. if e.target == &stopValue {
  599. s.setState(Stopped)
  600. } else if _, ok := e.target.(Command); ok {
  601. err = NotLeaderError
  602. } else if req, ok := e.target.(*AppendEntriesRequest); ok {
  603. e.returnValue, _ = s.processAppendEntriesRequest(req)
  604. } else if req, ok := e.target.(*RequestVoteRequest); ok {
  605. e.returnValue, _ = s.processRequestVoteRequest(req)
  606. } else if req, ok := e.target.(*SnapshotRecoveryRequest); ok {
  607. e.returnValue = s.processSnapshotRecoveryRequest(req)
  608. }
  609. // Callback to event.
  610. e.c <- err
  611. // Exit loop on state change.
  612. if s.State() != Snapshotting {
  613. break
  614. }
  615. }
  616. }
  617. //--------------------------------------
  618. // Commands
  619. //--------------------------------------
  620. // Attempts to execute a command and replicate it. The function will return
  621. // when the command has been successfully committed or an error has occurred.
  622. func (s *Server) Do(command Command) (interface{}, error) {
  623. return s.send(command)
  624. }
  625. // Processes a command.
  626. func (s *Server) processCommand(command Command, e *event) {
  627. s.debugln("server.command.process")
  628. // Create an entry for the command in the log.
  629. entry, err := s.log.createEntry(s.currentTerm, command)
  630. if err != nil {
  631. s.debugln("server.command.log.entry.error:", err)
  632. e.c <- err
  633. return
  634. }
  635. if err := s.log.appendEntry(entry); err != nil {
  636. s.debugln("server.command.log.error:", err)
  637. e.c <- err
  638. return
  639. }
  640. // Issue a callback for the entry once it's committed.
  641. go func() {
  642. // Wait for the entry to be committed.
  643. select {
  644. case <-entry.commit:
  645. var err error
  646. s.debugln("server.command.commit")
  647. e.returnValue, err = s.log.getEntryResult(entry, true)
  648. e.c <- err
  649. case <-time.After(time.Second):
  650. s.debugln("server.command.timeout")
  651. e.c <- CommandTimeoutError
  652. }
  653. }()
  654. // Issue an append entries response for the server.
  655. resp := newAppendEntriesResponse(s.currentTerm, true, s.log.currentIndex(), s.log.CommitIndex())
  656. resp.append = true
  657. resp.peer = s.Name()
  658. // this must be async
  659. // sendAsync is not really async every time
  660. // when the sending speed of the user is larger than
  661. // the processing speed of the server, the buffered channel
  662. // will be full. Then sendAsync will become sync, which will
  663. // cause deadlock here.
  664. // so we use a goroutine to avoid the deadlock
  665. go s.sendAsync(resp)
  666. }
  667. //--------------------------------------
  668. // Append Entries
  669. //--------------------------------------
  670. // Appends zero or more log entry from the leader to this server.
  671. func (s *Server) AppendEntries(req *AppendEntriesRequest) *AppendEntriesResponse {
  672. ret, _ := s.send(req)
  673. resp, _ := ret.(*AppendEntriesResponse)
  674. return resp
  675. }
  676. // Processes the "append entries" request.
  677. func (s *Server) processAppendEntriesRequest(req *AppendEntriesRequest) (*AppendEntriesResponse, bool) {
  678. s.traceln("server.ae.process")
  679. if req.Term < s.currentTerm {
  680. s.debugln("server.ae.error: stale term")
  681. return newAppendEntriesResponse(s.currentTerm, false, s.log.currentIndex(), s.log.CommitIndex()), false
  682. }
  683. // Update term and leader.
  684. s.setCurrentTerm(req.Term, req.LeaderName, true)
  685. // Reject if log doesn't contain a matching previous entry.
  686. if err := s.log.truncate(req.PrevLogIndex, req.PrevLogTerm); err != nil {
  687. s.debugln("server.ae.truncate.error: ", err)
  688. return newAppendEntriesResponse(s.currentTerm, false, s.log.currentIndex(), s.log.CommitIndex()), true
  689. }
  690. // Append entries to the log.
  691. if err := s.log.appendEntries(req.Entries); err != nil {
  692. s.debugln("server.ae.append.error: ", err)
  693. return newAppendEntriesResponse(s.currentTerm, false, s.log.currentIndex(), s.log.CommitIndex()), true
  694. }
  695. // Commit up to the commit index.
  696. if err := s.log.setCommitIndex(req.CommitIndex); err != nil {
  697. s.debugln("server.ae.commit.error: ", err)
  698. return newAppendEntriesResponse(s.currentTerm, false, s.log.currentIndex(), s.log.CommitIndex()), true
  699. }
  700. // once the server appended and commited all the log entries from the leader
  701. return newAppendEntriesResponse(s.currentTerm, true, s.log.currentIndex(), s.log.CommitIndex()), true
  702. }
  703. // Processes the "append entries" response from the peer. This is only
  704. // processed when the server is a leader. Responses received during other
  705. // states are dropped.
  706. func (s *Server) processAppendEntriesResponse(resp *AppendEntriesResponse) {
  707. // If we find a higher term then change to a follower and exit.
  708. if resp.Term > s.currentTerm {
  709. s.setCurrentTerm(resp.Term, "", false)
  710. return
  711. }
  712. // panic response if it's not successful.
  713. if !resp.Success {
  714. return
  715. }
  716. // if one peer successfully append a log from the leader term,
  717. // we add it to the synced list
  718. if resp.append == true {
  719. s.syncedPeer[resp.peer] = true
  720. }
  721. // Increment the commit count to make sure we have a quorum before committing.
  722. if len(s.syncedPeer) < s.QuorumSize() {
  723. return
  724. }
  725. // Determine the committed index that a majority has.
  726. var indices []uint64
  727. indices = append(indices, s.log.currentIndex())
  728. for _, peer := range s.peers {
  729. indices = append(indices, peer.getPrevLogIndex())
  730. }
  731. sort.Sort(uint64Slice(indices))
  732. // We can commit up to the index which the majority of the members have appended.
  733. commitIndex := indices[s.QuorumSize()-1]
  734. committedIndex := s.log.commitIndex
  735. if commitIndex > committedIndex {
  736. s.log.setCommitIndex(commitIndex)
  737. s.debugln("commit index ", commitIndex)
  738. for i := committedIndex; i < commitIndex; i++ {
  739. if entry := s.log.getEntry(i + 1); entry != nil {
  740. // if the leader is a new one and the entry came from the
  741. // old leader, the commit channel will be nil and no go routine
  742. // is waiting from this channel
  743. // if we try to send to it, the new leader will get stuck
  744. if entry.commit != nil {
  745. select {
  746. case entry.commit <- true:
  747. default:
  748. panic("server unable to send signal to commit channel")
  749. }
  750. }
  751. }
  752. }
  753. }
  754. }
  755. //--------------------------------------
  756. // Request Vote
  757. //--------------------------------------
  758. // Requests a vote from a server. A vote can be obtained if the vote's term is
  759. // at the server's current term and the server has not made a vote yet. A vote
  760. // can also be obtained if the term is greater than the server's current term.
  761. func (s *Server) RequestVote(req *RequestVoteRequest) *RequestVoteResponse {
  762. ret, _ := s.send(req)
  763. resp, _ := ret.(*RequestVoteResponse)
  764. return resp
  765. }
  766. // Processes a "request vote" request.
  767. func (s *Server) processRequestVoteRequest(req *RequestVoteRequest) (*RequestVoteResponse, bool) {
  768. // If the request is coming from an old term then reject it.
  769. if req.Term < s.currentTerm {
  770. s.debugln("server.rv.error: stale term")
  771. return newRequestVoteResponse(s.currentTerm, false), false
  772. }
  773. s.setCurrentTerm(req.Term, "", false)
  774. // If we've already voted for a different candidate then don't vote for this candidate.
  775. if s.votedFor != "" && s.votedFor != req.CandidateName {
  776. s.debugln("server.rv.error: duplicate vote: ", req.CandidateName,
  777. " already vote for ", s.votedFor)
  778. return newRequestVoteResponse(s.currentTerm, false), false
  779. }
  780. // If the candidate's log is not at least as up-to-date as our last log then don't vote.
  781. lastIndex, lastTerm := s.log.lastInfo()
  782. if lastIndex > req.LastLogIndex || lastTerm > req.LastLogTerm {
  783. s.debugln("server.rv.error: out of date log: ", req.CandidateName,
  784. "Index :[", lastIndex, "]", " [", req.LastLogIndex, "]",
  785. "Term :[", lastTerm, "]", " [", req.LastLogTerm, "]")
  786. return newRequestVoteResponse(s.currentTerm, false), false
  787. }
  788. // If we made it this far then cast a vote and reset our election time out.
  789. s.debugln("server.rv.vote: ", s.name, " votes for", req.CandidateName, "at term", req.Term)
  790. s.votedFor = req.CandidateName
  791. return newRequestVoteResponse(s.currentTerm, true), true
  792. }
  793. //--------------------------------------
  794. // Membership
  795. //--------------------------------------
  796. // Adds a peer to the server.
  797. func (s *Server) AddPeer(name string) error {
  798. s.debugln("server.peer.add: ", name, len(s.peers))
  799. // Do not allow peers to be added twice.
  800. if s.peers[name] != nil {
  801. return nil
  802. }
  803. // Only add the peer if it doesn't have the same name.
  804. if s.name != name {
  805. // when loading snapshot s.confFile should be nil
  806. if s.confFile != nil {
  807. _, err := fmt.Fprintln(s.confFile, name)
  808. s.debugln("server.peer.conf.write: ", name)
  809. if err != nil {
  810. return err
  811. }
  812. }
  813. peer := newPeer(s, name, s.heartbeatTimeout)
  814. if s.State() == Leader {
  815. peer.startHeartbeat()
  816. }
  817. s.peers[peer.name] = peer
  818. }
  819. return nil
  820. }
  821. // Removes a peer from the server.
  822. func (s *Server) RemovePeer(name string) error {
  823. s.debugln("server.peer.remove: ", name, len(s.peers))
  824. // Ignore removal of the server itself.
  825. if s.name == name {
  826. return nil
  827. }
  828. // Return error if peer doesn't exist.
  829. peer := s.peers[name]
  830. if peer == nil {
  831. return fmt.Errorf("raft: Peer not found: %s", name)
  832. }
  833. // TODO: Flush entries to the peer first.
  834. // Stop peer and remove it.
  835. peer.stopHeartbeat()
  836. delete(s.peers, name)
  837. s.confFile.Truncate(0)
  838. s.confFile.Seek(0, os.SEEK_SET)
  839. for peer := range s.peers {
  840. _, err := fmt.Fprintln(s.confFile, peer)
  841. if err != nil {
  842. return err
  843. }
  844. }
  845. return nil
  846. }
  847. //--------------------------------------
  848. // Log compaction
  849. //--------------------------------------
  850. // The background snapshot function
  851. func (s *Server) Snapshot() {
  852. for {
  853. // TODO: change this... to something reasonable
  854. time.Sleep(1 * time.Second)
  855. s.takeSnapshot()
  856. }
  857. }
  858. func (s *Server) takeSnapshot() error {
  859. //TODO put a snapshot mutex
  860. s.debugln("take Snapshot")
  861. if s.currentSnapshot != nil {
  862. return errors.New("handling snapshot")
  863. }
  864. lastIndex, lastTerm := s.log.commitInfo()
  865. if lastIndex == 0 {
  866. return errors.New("No logs")
  867. }
  868. path := s.SnapshotPath(lastIndex, lastTerm)
  869. var state []byte
  870. var err error
  871. if s.stateMachine != nil {
  872. state, err = s.stateMachine.Save()
  873. if err != nil {
  874. return err
  875. }
  876. } else {
  877. state = []byte{0}
  878. }
  879. var peerNames []string
  880. for _, peer := range s.peers {
  881. peerNames = append(peerNames, peer.Name())
  882. }
  883. peerNames = append(peerNames, s.Name())
  884. s.currentSnapshot = &Snapshot{lastIndex, lastTerm, peerNames, state, path}
  885. s.saveSnapshot()
  886. // We keep some log entries after the snapshot
  887. // We do not want to send the whole snapshot
  888. // to the slightly slow machines
  889. if lastIndex-s.log.startIndex > NumberOfLogEntriesAfterSnapshot {
  890. compactIndex := lastIndex - NumberOfLogEntriesAfterSnapshot
  891. compactTerm := s.log.getEntry(compactIndex).Term
  892. s.log.compact(compactIndex, compactTerm)
  893. }
  894. return nil
  895. }
  896. // Retrieves the log path for the server.
  897. func (s *Server) saveSnapshot() error {
  898. if s.currentSnapshot == nil {
  899. return errors.New("no snapshot to save")
  900. }
  901. err := s.currentSnapshot.save()
  902. if err != nil {
  903. return err
  904. }
  905. tmp := s.lastSnapshot
  906. s.lastSnapshot = s.currentSnapshot
  907. // delete the previous snapshot if there is any change
  908. if tmp != nil && !(tmp.LastIndex == s.lastSnapshot.LastIndex && tmp.LastTerm == s.lastSnapshot.LastTerm) {
  909. tmp.remove()
  910. }
  911. s.currentSnapshot = nil
  912. return nil
  913. }
  914. // Retrieves the log path for the server.
  915. func (s *Server) SnapshotPath(lastIndex uint64, lastTerm uint64) string {
  916. return path.Join(s.path, "snapshot", fmt.Sprintf("%v_%v.ss", lastTerm, lastIndex))
  917. }
  918. func (s *Server) RequestSnapshot(req *SnapshotRequest) *SnapshotResponse {
  919. ret, _ := s.send(req)
  920. resp, _ := ret.(*SnapshotResponse)
  921. return resp
  922. }
  923. func (s *Server) processSnapshotRequest(req *SnapshotRequest) *SnapshotResponse {
  924. // If the follower’s log contains an entry at the snapshot’s last index with a term
  925. // that matches the snapshot’s last term
  926. // Then the follower already has all the information found in the snapshot
  927. // and can reply false
  928. entry := s.log.getEntry(req.LastIndex)
  929. if entry != nil && entry.Term == req.LastTerm {
  930. return newSnapshotResponse(false)
  931. }
  932. s.setState(Snapshotting)
  933. return newSnapshotResponse(true)
  934. }
  935. func (s *Server) SnapshotRecoveryRequest(req *SnapshotRecoveryRequest) *SnapshotRecoveryResponse {
  936. ret, _ := s.send(req)
  937. resp, _ := ret.(*SnapshotRecoveryResponse)
  938. return resp
  939. }
  940. func (s *Server) processSnapshotRecoveryRequest(req *SnapshotRecoveryRequest) *SnapshotRecoveryResponse {
  941. s.stateMachine.Recovery(req.State)
  942. // clear the peer map
  943. s.peers = make(map[string]*Peer)
  944. // recovery the cluster configuration
  945. for _, peerName := range req.Peers {
  946. s.AddPeer(peerName)
  947. }
  948. //update term and index
  949. s.currentTerm = req.LastTerm
  950. s.log.updateCommitIndex(req.LastIndex)
  951. snapshotPath := s.SnapshotPath(req.LastIndex, req.LastTerm)
  952. s.currentSnapshot = &Snapshot{req.LastIndex, req.LastTerm, req.Peers, req.State, snapshotPath}
  953. s.saveSnapshot()
  954. // clear the previous log entries
  955. s.log.compact(req.LastIndex, req.LastTerm)
  956. return newSnapshotRecoveryResponse(req.LastTerm, true, req.LastIndex)
  957. }
  958. // Load a snapshot at restart
  959. func (s *Server) LoadSnapshot() error {
  960. dir, err := os.OpenFile(path.Join(s.path, "snapshot"), os.O_RDONLY, 0)
  961. if err != nil {
  962. return err
  963. }
  964. filenames, err := dir.Readdirnames(-1)
  965. if err != nil {
  966. dir.Close()
  967. panic(err)
  968. }
  969. dir.Close()
  970. if len(filenames) == 0 {
  971. return errors.New("no snapshot")
  972. }
  973. // not sure how many snapshot we should keep
  974. sort.Strings(filenames)
  975. snapshotPath := path.Join(s.path, "snapshot", filenames[len(filenames)-1])
  976. // should not fail
  977. file, err := os.OpenFile(snapshotPath, os.O_RDONLY, 0)
  978. defer file.Close()
  979. if err != nil {
  980. panic(err)
  981. }
  982. // TODO check checksum first
  983. var snapshotBytes []byte
  984. var checksum uint32
  985. n, err := fmt.Fscanf(file, "%08x\n", &checksum)
  986. if err != nil {
  987. return err
  988. }
  989. if n != 1 {
  990. return errors.New("Bad snapshot file")
  991. }
  992. snapshotBytes, _ = ioutil.ReadAll(file)
  993. s.debugln(string(snapshotBytes))
  994. // Generate checksum.
  995. byteChecksum := crc32.ChecksumIEEE(snapshotBytes)
  996. if uint32(checksum) != byteChecksum {
  997. s.debugln(checksum, " ", byteChecksum)
  998. return errors.New("bad snapshot file")
  999. }
  1000. err = json.Unmarshal(snapshotBytes, &s.lastSnapshot)
  1001. if err != nil {
  1002. s.debugln("unmarshal error: ", err)
  1003. return err
  1004. }
  1005. err = s.stateMachine.Recovery(s.lastSnapshot.State)
  1006. if err != nil {
  1007. s.debugln("recovery error: ", err)
  1008. return err
  1009. }
  1010. for _, peerName := range s.lastSnapshot.Peers {
  1011. s.AddPeer(peerName)
  1012. }
  1013. s.log.startTerm = s.lastSnapshot.LastTerm
  1014. s.log.startIndex = s.lastSnapshot.LastIndex
  1015. s.log.updateCommitIndex(s.lastSnapshot.LastIndex)
  1016. return err
  1017. }
  1018. //--------------------------------------
  1019. // Debugging
  1020. //--------------------------------------
  1021. func (s *Server) debugln(v ...interface{}) {
  1022. debugf("[%s Term:%d] %s", s.name, s.currentTerm, fmt.Sprintln(v...))
  1023. }
  1024. func (s *Server) traceln(v ...interface{}) {
  1025. tracef("[%s] %s", s.name, fmt.Sprintln(v...))
  1026. }