raft.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601
  1. package raft
  2. import (
  3. "errors"
  4. "fmt"
  5. "sort"
  6. "sync/atomic"
  7. )
  8. const none = -1
  9. type messageType int64
  10. const (
  11. msgHup messageType = iota
  12. msgBeat
  13. msgProp
  14. msgApp
  15. msgAppResp
  16. msgVote
  17. msgVoteResp
  18. msgSnap
  19. msgDenied
  20. )
  21. var mtmap = [...]string{
  22. msgHup: "msgHup",
  23. msgBeat: "msgBeat",
  24. msgProp: "msgProp",
  25. msgApp: "msgApp",
  26. msgAppResp: "msgAppResp",
  27. msgVote: "msgVote",
  28. msgVoteResp: "msgVoteResp",
  29. msgSnap: "msgSnap",
  30. msgDenied: "msgDenied",
  31. }
  32. func (mt messageType) String() string {
  33. return mtmap[int64(mt)]
  34. }
  35. var errNoLeader = errors.New("no leader")
  36. const (
  37. stateFollower stateType = iota
  38. stateCandidate
  39. stateLeader
  40. )
  41. type stateType int64
  42. var stmap = [...]string{
  43. stateFollower: "stateFollower",
  44. stateCandidate: "stateCandidate",
  45. stateLeader: "stateLeader",
  46. }
  47. var stepmap = [...]stepFunc{
  48. stateFollower: stepFollower,
  49. stateCandidate: stepCandidate,
  50. stateLeader: stepLeader,
  51. }
  52. func (st stateType) String() string {
  53. return stmap[int64(st)]
  54. }
  55. var EmptyState = State{}
  56. type Message struct {
  57. Type messageType
  58. ClusterId int64
  59. To int64
  60. From int64
  61. Term int64
  62. LogTerm int64
  63. Index int64
  64. Entries []Entry
  65. Commit int64
  66. Snapshot Snapshot
  67. }
  68. func (m Message) IsMsgApp() bool {
  69. return m.Type == msgApp
  70. }
  71. func (m Message) String() string {
  72. return fmt.Sprintf("type=%v from=%x to=%x term=%d logTerm=%d i=%d ci=%d len(ents)=%d",
  73. m.Type, m.From, m.To, m.Term, m.LogTerm, m.Index, m.Commit, len(m.Entries))
  74. }
  75. type index struct {
  76. match, next int64
  77. }
  78. func (in *index) update(n int64) {
  79. in.match = n
  80. in.next = n + 1
  81. }
  82. func (in *index) decr() {
  83. if in.next--; in.next < 1 {
  84. in.next = 1
  85. }
  86. }
  87. func (in *index) String() string {
  88. return fmt.Sprintf("n=%d m=%d", in.next, in.match)
  89. }
  90. // An AtomicInt is an int64 to be accessed atomically.
  91. type atomicInt int64
  92. func (i *atomicInt) Set(n int64) {
  93. atomic.StoreInt64((*int64)(i), n)
  94. }
  95. func (i *atomicInt) Get() int64 {
  96. return atomic.LoadInt64((*int64)(i))
  97. }
  98. // int64Slice implements sort interface
  99. type int64Slice []int64
  100. func (p int64Slice) Len() int { return len(p) }
  101. func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
  102. func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  103. type raft struct {
  104. State
  105. // --- new stuff ---
  106. name string
  107. election int
  108. heartbeat int
  109. // -----------------
  110. clusterId int64
  111. id int64
  112. // the term we are participating in at any time
  113. index atomicInt
  114. // the log
  115. raftLog *raftLog
  116. ins map[int64]*index
  117. state stateType
  118. votes map[int64]bool
  119. msgs []Message
  120. // the leader id
  121. lead atomicInt
  122. // pending reconfiguration
  123. pendingConf bool
  124. unstableState State
  125. // promotable indicates whether state machine could be promoted.
  126. // New machine has to wait until it has been added to the cluster, or it
  127. // may become the leader of the cluster without it.
  128. promotable bool
  129. }
  130. func newStateMachine(id int64, peers []int64) *raft {
  131. if id == none {
  132. panic("cannot use none id")
  133. }
  134. sm := &raft{id: id, clusterId: none, lead: none, raftLog: newLog(), ins: make(map[int64]*index)}
  135. for _, p := range peers {
  136. sm.ins[p] = &index{}
  137. }
  138. sm.reset(0)
  139. return sm
  140. }
  141. func (r *raft) hasLeader() bool { return r.state != stateCandidate }
  142. func (r *raft) propose(data []byte) {
  143. r.Step(Message{From: r.id, Type: msgProp, Entries: []Entry{{Data: data}}})
  144. }
  145. func (sm *raft) String() string {
  146. s := fmt.Sprintf(`state=%v term=%d`, sm.state, sm.Term)
  147. switch sm.state {
  148. case stateFollower:
  149. s += fmt.Sprintf(" vote=%v lead=%v", sm.Vote, sm.lead)
  150. case stateCandidate:
  151. s += fmt.Sprintf(` votes="%v"`, sm.votes)
  152. case stateLeader:
  153. s += fmt.Sprintf(` ins="%v"`, sm.ins)
  154. }
  155. return s
  156. }
  157. func (sm *raft) poll(id int64, v bool) (granted int) {
  158. if _, ok := sm.votes[id]; !ok {
  159. sm.votes[id] = v
  160. }
  161. for _, vv := range sm.votes {
  162. if vv {
  163. granted++
  164. }
  165. }
  166. return granted
  167. }
  168. // send persists state to stable storage and then sends to its mailbox.
  169. func (sm *raft) send(m Message) {
  170. m.ClusterId = sm.clusterId
  171. m.From = sm.id
  172. m.Term = sm.Term
  173. sm.msgs = append(sm.msgs, m)
  174. }
  175. // sendAppend sends RRPC, with entries to the given peer.
  176. func (sm *raft) sendAppend(to int64) {
  177. in := sm.ins[to]
  178. m := Message{}
  179. m.To = to
  180. m.Index = in.next - 1
  181. if sm.needSnapshot(m.Index) {
  182. m.Type = msgSnap
  183. m.Snapshot = sm.raftLog.snapshot
  184. } else {
  185. m.Type = msgApp
  186. m.LogTerm = sm.raftLog.term(in.next - 1)
  187. m.Entries = sm.raftLog.entries(in.next)
  188. m.Commit = sm.raftLog.committed
  189. }
  190. sm.send(m)
  191. }
  192. // sendHeartbeat sends RRPC, without entries to the given peer.
  193. func (sm *raft) sendHeartbeat(to int64) {
  194. in := sm.ins[to]
  195. index := max(in.next-1, sm.raftLog.lastIndex())
  196. m := Message{
  197. To: to,
  198. Type: msgApp,
  199. Index: index,
  200. LogTerm: sm.raftLog.term(index),
  201. Commit: sm.raftLog.committed,
  202. }
  203. sm.send(m)
  204. }
  205. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to sm.mis.
  206. func (sm *raft) bcastAppend() {
  207. for i := range sm.ins {
  208. if i == sm.id {
  209. continue
  210. }
  211. sm.sendAppend(i)
  212. }
  213. }
  214. // bcastHeartbeat sends RRPC, without entries to all the peers.
  215. func (sm *raft) bcastHeartbeat() {
  216. for i := range sm.ins {
  217. if i == sm.id {
  218. continue
  219. }
  220. sm.sendHeartbeat(i)
  221. }
  222. }
  223. func (sm *raft) maybeCommit() bool {
  224. // TODO(bmizerany): optimize.. Currently naive
  225. mis := make(int64Slice, 0, len(sm.ins))
  226. for i := range sm.ins {
  227. mis = append(mis, sm.ins[i].match)
  228. }
  229. sort.Sort(sort.Reverse(mis))
  230. mci := mis[sm.q()-1]
  231. return sm.raftLog.maybeCommit(mci, sm.Term)
  232. }
  233. // nextEnts returns the appliable entries and updates the applied index
  234. func (sm *raft) nextEnts() (ents []Entry) {
  235. ents = sm.raftLog.nextEnts()
  236. sm.raftLog.resetNextEnts()
  237. return ents
  238. }
  239. func (sm *raft) reset(term int64) {
  240. sm.setTerm(term)
  241. sm.lead.Set(none)
  242. sm.setVote(none)
  243. sm.votes = make(map[int64]bool)
  244. for i := range sm.ins {
  245. sm.ins[i] = &index{next: sm.raftLog.lastIndex() + 1}
  246. if i == sm.id {
  247. sm.ins[i].match = sm.raftLog.lastIndex()
  248. }
  249. }
  250. }
  251. func (sm *raft) q() int {
  252. return len(sm.ins)/2 + 1
  253. }
  254. func (sm *raft) appendEntry(e Entry) {
  255. e.Term = sm.Term
  256. e.Index = sm.raftLog.lastIndex() + 1
  257. sm.LastIndex = sm.raftLog.append(sm.raftLog.lastIndex(), e)
  258. sm.ins[sm.id].update(sm.raftLog.lastIndex())
  259. sm.maybeCommit()
  260. }
  261. func (sm *raft) becomeFollower(term int64, lead int64) {
  262. sm.reset(term)
  263. sm.lead.Set(lead)
  264. sm.state = stateFollower
  265. sm.pendingConf = false
  266. }
  267. func (sm *raft) becomeCandidate() {
  268. // TODO(xiangli) remove the panic when the raft implementation is stable
  269. if sm.state == stateLeader {
  270. panic("invalid transition [leader -> candidate]")
  271. }
  272. sm.reset(sm.Term + 1)
  273. sm.setVote(sm.id)
  274. sm.state = stateCandidate
  275. }
  276. func (sm *raft) becomeLeader() {
  277. // TODO(xiangli) remove the panic when the raft implementation is stable
  278. if sm.state == stateFollower {
  279. panic("invalid transition [follower -> leader]")
  280. }
  281. sm.reset(sm.Term)
  282. sm.lead.Set(sm.id)
  283. sm.state = stateLeader
  284. for _, e := range sm.raftLog.entries(sm.raftLog.committed + 1) {
  285. if e.isConfig() {
  286. sm.pendingConf = true
  287. }
  288. }
  289. sm.appendEntry(Entry{Type: Normal, Data: nil})
  290. }
  291. func (sm *raft) ReadMessages() []Message {
  292. msgs := sm.msgs
  293. sm.msgs = make([]Message, 0)
  294. return msgs
  295. }
  296. func (sm *raft) Step(m Message) error {
  297. if m.Type == msgHup {
  298. sm.becomeCandidate()
  299. if sm.q() == sm.poll(sm.id, true) {
  300. sm.becomeLeader()
  301. }
  302. for i := range sm.ins {
  303. if i == sm.id {
  304. continue
  305. }
  306. lasti := sm.raftLog.lastIndex()
  307. sm.send(Message{To: i, Type: msgVote, Index: lasti, LogTerm: sm.raftLog.term(lasti)})
  308. }
  309. }
  310. switch {
  311. case m.Term == 0:
  312. // local message
  313. case m.Term > sm.Term:
  314. lead := m.From
  315. if m.Type == msgVote {
  316. lead = none
  317. }
  318. sm.becomeFollower(m.Term, lead)
  319. case m.Term < sm.Term:
  320. // ignore
  321. }
  322. stepmap[sm.state](sm, m)
  323. return nil
  324. }
  325. func (sm *raft) handleAppendEntries(m Message) {
  326. if sm.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  327. sm.LastIndex = sm.raftLog.lastIndex()
  328. sm.send(Message{To: m.From, Type: msgAppResp, Index: sm.raftLog.lastIndex()})
  329. } else {
  330. sm.send(Message{To: m.From, Type: msgAppResp, Index: -1})
  331. }
  332. }
  333. func (sm *raft) handleSnapshot(m Message) {
  334. if sm.restore(m.Snapshot) {
  335. sm.send(Message{To: m.From, Type: msgAppResp, Index: sm.raftLog.lastIndex()})
  336. } else {
  337. sm.send(Message{To: m.From, Type: msgAppResp, Index: sm.raftLog.committed})
  338. }
  339. }
  340. func (sm *raft) addNode(id int64) {
  341. sm.addIns(id, 0, sm.raftLog.lastIndex()+1)
  342. sm.pendingConf = false
  343. if id == sm.id {
  344. sm.promotable = true
  345. }
  346. }
  347. func (sm *raft) removeNode(id int64) {
  348. sm.deleteIns(id)
  349. sm.pendingConf = false
  350. }
  351. type stepFunc func(sm *raft, m Message) bool
  352. func stepLeader(sm *raft, m Message) bool {
  353. switch m.Type {
  354. case msgBeat:
  355. sm.bcastHeartbeat()
  356. case msgProp:
  357. if len(m.Entries) != 1 {
  358. panic("unexpected length(entries) of a msgProp")
  359. }
  360. e := m.Entries[0]
  361. if e.isConfig() {
  362. if sm.pendingConf {
  363. return false
  364. }
  365. sm.pendingConf = true
  366. }
  367. sm.appendEntry(e)
  368. sm.bcastAppend()
  369. case msgAppResp:
  370. if m.Index < 0 {
  371. sm.ins[m.From].decr()
  372. sm.sendAppend(m.From)
  373. } else {
  374. sm.ins[m.From].update(m.Index)
  375. if sm.maybeCommit() {
  376. sm.bcastAppend()
  377. }
  378. }
  379. case msgVote:
  380. sm.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  381. }
  382. return true
  383. }
  384. func stepCandidate(sm *raft, m Message) bool {
  385. switch m.Type {
  386. case msgProp:
  387. return false
  388. case msgApp:
  389. sm.becomeFollower(sm.Term, m.From)
  390. sm.handleAppendEntries(m)
  391. case msgSnap:
  392. sm.becomeFollower(m.Term, m.From)
  393. sm.handleSnapshot(m)
  394. case msgVote:
  395. sm.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  396. case msgVoteResp:
  397. gr := sm.poll(m.From, m.Index >= 0)
  398. switch sm.q() {
  399. case gr:
  400. sm.becomeLeader()
  401. sm.bcastAppend()
  402. case len(sm.votes) - gr:
  403. sm.becomeFollower(sm.Term, none)
  404. }
  405. }
  406. return true
  407. }
  408. func stepFollower(sm *raft, m Message) bool {
  409. switch m.Type {
  410. case msgProp:
  411. if sm.lead.Get() == none {
  412. return false
  413. }
  414. m.To = sm.lead.Get()
  415. sm.send(m)
  416. case msgApp:
  417. sm.lead.Set(m.From)
  418. sm.handleAppendEntries(m)
  419. case msgSnap:
  420. sm.handleSnapshot(m)
  421. case msgVote:
  422. if (sm.Vote == none || sm.Vote == m.From) && sm.raftLog.isUpToDate(m.Index, m.LogTerm) {
  423. sm.setVote(m.From)
  424. sm.send(Message{To: m.From, Type: msgVoteResp, Index: sm.raftLog.lastIndex()})
  425. } else {
  426. sm.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  427. }
  428. }
  429. return true
  430. }
  431. func (sm *raft) compact(d []byte) {
  432. sm.raftLog.snap(d, sm.raftLog.applied, sm.raftLog.term(sm.raftLog.applied), sm.nodes())
  433. sm.raftLog.compact(sm.raftLog.applied)
  434. }
  435. // restore recovers the statemachine from a snapshot. It restores the log and the
  436. // configuration of statemachine.
  437. func (sm *raft) restore(s Snapshot) bool {
  438. if s.Index <= sm.raftLog.committed {
  439. return false
  440. }
  441. sm.raftLog.restore(s)
  442. sm.LastIndex = sm.raftLog.lastIndex()
  443. sm.ins = make(map[int64]*index)
  444. for _, n := range s.Nodes {
  445. if n == sm.id {
  446. sm.addIns(n, sm.raftLog.lastIndex(), sm.raftLog.lastIndex()+1)
  447. } else {
  448. sm.addIns(n, 0, sm.raftLog.lastIndex()+1)
  449. }
  450. }
  451. sm.pendingConf = false
  452. return true
  453. }
  454. func (sm *raft) needSnapshot(i int64) bool {
  455. if i < sm.raftLog.offset {
  456. if sm.raftLog.snapshot.IsEmpty() {
  457. panic("need non-empty snapshot")
  458. }
  459. return true
  460. }
  461. return false
  462. }
  463. func (sm *raft) nodes() []int64 {
  464. nodes := make([]int64, 0, len(sm.ins))
  465. for k := range sm.ins {
  466. nodes = append(nodes, k)
  467. }
  468. return nodes
  469. }
  470. func (sm *raft) setTerm(term int64) {
  471. sm.Term = term
  472. sm.saveState()
  473. }
  474. func (sm *raft) setVote(vote int64) {
  475. sm.Vote = vote
  476. sm.saveState()
  477. }
  478. func (sm *raft) addIns(id, match, next int64) {
  479. sm.ins[id] = &index{next: next, match: match}
  480. sm.saveState()
  481. }
  482. func (sm *raft) deleteIns(id int64) {
  483. delete(sm.ins, id)
  484. sm.saveState()
  485. }
  486. // saveState saves the state to sm.unstableState
  487. // When there is a term change, vote change or configuration change, raft
  488. // must call saveState.
  489. func (sm *raft) saveState() {
  490. sm.setState(sm.Vote, sm.Term, sm.raftLog.committed)
  491. }
  492. func (sm *raft) clearState() {
  493. sm.setState(0, 0, 0)
  494. }
  495. func (sm *raft) setState(vote, term, commit int64) {
  496. sm.unstableState.Vote = vote
  497. sm.unstableState.Term = term
  498. sm.unstableState.Commit = commit
  499. }
  500. func (sm *raft) loadEnts(ents []Entry) {
  501. if !sm.raftLog.isEmpty() {
  502. panic("cannot load entries when log is not empty")
  503. }
  504. sm.raftLog.append(0, ents...)
  505. sm.raftLog.unstable = sm.raftLog.lastIndex() + 1
  506. }
  507. func (sm *raft) loadState(state State) {
  508. sm.raftLog.committed = state.Commit
  509. sm.setTerm(state.Term)
  510. sm.setVote(state.Vote)
  511. }
  512. func (s *State) IsEmpty() bool {
  513. return s.Term == 0
  514. }