raft.go 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. package raft
  2. import (
  3. "errors"
  4. "sort"
  5. "sync/atomic"
  6. )
  7. const none = -1
  8. type messageType int64
  9. const (
  10. msgHup messageType = iota
  11. msgBeat
  12. msgProp
  13. msgApp
  14. msgAppResp
  15. msgVote
  16. msgVoteResp
  17. msgSnap
  18. )
  19. var mtmap = [...]string{
  20. msgHup: "msgHup",
  21. msgBeat: "msgBeat",
  22. msgProp: "msgProp",
  23. msgApp: "msgApp",
  24. msgAppResp: "msgAppResp",
  25. msgVote: "msgVote",
  26. msgVoteResp: "msgVoteResp",
  27. msgSnap: "msgSnap",
  28. }
  29. func (mt messageType) String() string {
  30. return mtmap[int64(mt)]
  31. }
  32. var errNoLeader = errors.New("no leader")
  33. const (
  34. stateFollower stateType = iota
  35. stateCandidate
  36. stateLeader
  37. )
  38. type stateType int64
  39. var stmap = [...]string{
  40. stateFollower: "stateFollower",
  41. stateCandidate: "stateCandidate",
  42. stateLeader: "stateLeader",
  43. }
  44. var stepmap = [...]stepFunc{
  45. stateFollower: stepFollower,
  46. stateCandidate: stepCandidate,
  47. stateLeader: stepLeader,
  48. }
  49. func (st stateType) String() string {
  50. return stmap[int64(st)]
  51. }
  52. type Message struct {
  53. Type messageType
  54. To int64
  55. From int64
  56. Term int64
  57. LogTerm int64
  58. Index int64
  59. PrevTerm int64
  60. Entries []Entry
  61. Commit int64
  62. Snapshot Snapshot
  63. }
  64. type index struct {
  65. match, next int64
  66. }
  67. func (in *index) update(n int64) {
  68. in.match = n
  69. in.next = n + 1
  70. }
  71. func (in *index) decr() {
  72. if in.next--; in.next < 1 {
  73. in.next = 1
  74. }
  75. }
  76. // An AtomicInt is an int64 to be accessed atomically.
  77. type atomicInt int64
  78. func (i *atomicInt) Set(n int64) {
  79. atomic.StoreInt64((*int64)(i), n)
  80. }
  81. func (i *atomicInt) Get() int64 {
  82. return atomic.LoadInt64((*int64)(i))
  83. }
  84. // int64Slice implements sort interface
  85. type int64Slice []int64
  86. func (p int64Slice) Len() int { return len(p) }
  87. func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
  88. func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  89. type stateMachine struct {
  90. id int64
  91. // the term we are participating in at any time
  92. term atomicInt
  93. index atomicInt
  94. // who we voted for in term
  95. vote int64
  96. // the log
  97. log *log
  98. ins map[int64]*index
  99. state stateType
  100. votes map[int64]bool
  101. msgs []Message
  102. // the leader id
  103. lead atomicInt
  104. // pending reconfiguration
  105. pendingConf bool
  106. snapshoter Snapshoter
  107. }
  108. func newStateMachine(id int64, peers []int64) *stateMachine {
  109. sm := &stateMachine{id: id, log: newLog(), ins: make(map[int64]*index)}
  110. for _, p := range peers {
  111. sm.ins[p] = &index{}
  112. }
  113. sm.reset(0)
  114. return sm
  115. }
  116. func (sm *stateMachine) setSnapshoter(snapshoter Snapshoter) {
  117. sm.snapshoter = snapshoter
  118. }
  119. func (sm *stateMachine) poll(id int64, v bool) (granted int) {
  120. if _, ok := sm.votes[id]; !ok {
  121. sm.votes[id] = v
  122. }
  123. for _, vv := range sm.votes {
  124. if vv {
  125. granted++
  126. }
  127. }
  128. return granted
  129. }
  130. // send persists state to stable storage and then sends to its mailbox.
  131. func (sm *stateMachine) send(m Message) {
  132. m.From = sm.id
  133. m.Term = sm.term.Get()
  134. sm.msgs = append(sm.msgs, m)
  135. }
  136. // sendAppend sends RRPC, with entries to the given peer.
  137. func (sm *stateMachine) sendAppend(to int64) {
  138. in := sm.ins[to]
  139. m := Message{}
  140. m.To = to
  141. m.Index = in.next - 1
  142. if sm.needSnapshot(m.Index) {
  143. m.Type = msgSnap
  144. m.Snapshot = sm.snapshoter.GetSnap()
  145. } else {
  146. m.Type = msgApp
  147. m.LogTerm = sm.log.term(in.next - 1)
  148. m.Entries = sm.log.entries(in.next)
  149. m.Commit = sm.log.committed
  150. }
  151. sm.send(m)
  152. }
  153. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to sm.mis.
  154. func (sm *stateMachine) bcastAppend() {
  155. for i := range sm.ins {
  156. if i == sm.id {
  157. continue
  158. }
  159. sm.sendAppend(i)
  160. }
  161. }
  162. func (sm *stateMachine) maybeCommit() bool {
  163. // TODO(bmizerany): optimize.. Currently naive
  164. mis := make(int64Slice, 0, len(sm.ins))
  165. for i := range sm.ins {
  166. mis = append(mis, sm.ins[i].match)
  167. }
  168. sort.Sort(sort.Reverse(mis))
  169. mci := mis[sm.q()-1]
  170. return sm.log.maybeCommit(mci, sm.term.Get())
  171. }
  172. // nextEnts returns the appliable entries and updates the applied index
  173. func (sm *stateMachine) nextEnts() (ents []Entry) {
  174. return sm.log.nextEnts()
  175. }
  176. func (sm *stateMachine) reset(term int64) {
  177. sm.term.Set(term)
  178. sm.lead.Set(none)
  179. sm.vote = none
  180. sm.votes = make(map[int64]bool)
  181. for i := range sm.ins {
  182. sm.ins[i] = &index{next: sm.log.lastIndex() + 1}
  183. if i == sm.id {
  184. sm.ins[i].match = sm.log.lastIndex()
  185. }
  186. }
  187. }
  188. func (sm *stateMachine) q() int {
  189. return len(sm.ins)/2 + 1
  190. }
  191. func (sm *stateMachine) appendEntry(e Entry) {
  192. e.Term = sm.term.Get()
  193. sm.index.Set(sm.log.append(sm.log.lastIndex(), e))
  194. sm.ins[sm.id].update(sm.log.lastIndex())
  195. sm.maybeCommit()
  196. }
  197. // promotable indicates whether state machine could be promoted.
  198. // New machine has to wait for the first log entry to be committed, or it will
  199. // always start as a one-node cluster.
  200. func (sm *stateMachine) promotable() bool {
  201. return sm.log.committed != 0
  202. }
  203. func (sm *stateMachine) becomeFollower(term int64, lead int64) {
  204. sm.reset(term)
  205. sm.lead.Set(lead)
  206. sm.state = stateFollower
  207. sm.pendingConf = false
  208. }
  209. func (sm *stateMachine) becomeCandidate() {
  210. // TODO(xiangli) remove the panic when the raft implementation is stable
  211. if sm.state == stateLeader {
  212. panic("invalid transition [leader -> candidate]")
  213. }
  214. sm.reset(sm.term.Get() + 1)
  215. sm.vote = sm.id
  216. sm.state = stateCandidate
  217. }
  218. func (sm *stateMachine) becomeLeader() {
  219. // TODO(xiangli) remove the panic when the raft implementation is stable
  220. if sm.state == stateFollower {
  221. panic("invalid transition [follower -> leader]")
  222. }
  223. sm.reset(sm.term.Get())
  224. sm.lead.Set(sm.id)
  225. sm.state = stateLeader
  226. for _, e := range sm.log.entries(sm.log.committed + 1) {
  227. if e.isConfig() {
  228. sm.pendingConf = true
  229. }
  230. }
  231. sm.appendEntry(Entry{Type: Normal, Data: nil})
  232. }
  233. func (sm *stateMachine) Msgs() []Message {
  234. msgs := sm.msgs
  235. sm.msgs = make([]Message, 0)
  236. return msgs
  237. }
  238. func (sm *stateMachine) Step(m Message) (ok bool) {
  239. if m.Type == msgHup {
  240. sm.becomeCandidate()
  241. if sm.q() == sm.poll(sm.id, true) {
  242. sm.becomeLeader()
  243. return true
  244. }
  245. for i := range sm.ins {
  246. if i == sm.id {
  247. continue
  248. }
  249. lasti := sm.log.lastIndex()
  250. sm.send(Message{To: i, Type: msgVote, Index: lasti, LogTerm: sm.log.term(lasti)})
  251. }
  252. return true
  253. }
  254. switch {
  255. case m.Term == 0:
  256. // local message
  257. case m.Term > sm.term.Get():
  258. sm.becomeFollower(m.Term, m.From)
  259. case m.Term < sm.term.Get():
  260. // ignore
  261. return true
  262. }
  263. return stepmap[sm.state](sm, m)
  264. }
  265. func (sm *stateMachine) handleAppendEntries(m Message) {
  266. if sm.log.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  267. sm.index.Set(sm.log.lastIndex())
  268. sm.send(Message{To: m.From, Type: msgAppResp, Index: sm.log.lastIndex()})
  269. } else {
  270. sm.send(Message{To: m.From, Type: msgAppResp, Index: -1})
  271. }
  272. }
  273. func (sm *stateMachine) handleSnapshot(m Message) {
  274. sm.restore(m.Snapshot)
  275. sm.send(Message{To: m.From, Type: msgAppResp, Index: sm.log.lastIndex()})
  276. }
  277. func (sm *stateMachine) addNode(id int64) {
  278. sm.ins[id] = &index{next: sm.log.lastIndex() + 1}
  279. sm.pendingConf = false
  280. }
  281. func (sm *stateMachine) removeNode(id int64) {
  282. delete(sm.ins, id)
  283. sm.pendingConf = false
  284. }
  285. type stepFunc func(sm *stateMachine, m Message) bool
  286. func stepLeader(sm *stateMachine, m Message) bool {
  287. switch m.Type {
  288. case msgBeat:
  289. sm.bcastAppend()
  290. case msgProp:
  291. if len(m.Entries) != 1 {
  292. panic("unexpected length(entries) of a msgProp")
  293. }
  294. e := m.Entries[0]
  295. if e.isConfig() {
  296. if sm.pendingConf {
  297. return false
  298. }
  299. sm.pendingConf = true
  300. }
  301. sm.appendEntry(e)
  302. sm.bcastAppend()
  303. case msgAppResp:
  304. if m.Index < 0 {
  305. sm.ins[m.From].decr()
  306. sm.sendAppend(m.From)
  307. } else {
  308. sm.ins[m.From].update(m.Index)
  309. if sm.maybeCommit() {
  310. sm.bcastAppend()
  311. }
  312. }
  313. case msgVote:
  314. sm.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  315. }
  316. return true
  317. }
  318. func stepCandidate(sm *stateMachine, m Message) bool {
  319. switch m.Type {
  320. case msgProp:
  321. return false
  322. case msgApp:
  323. sm.becomeFollower(sm.term.Get(), m.From)
  324. sm.handleAppendEntries(m)
  325. case msgSnap:
  326. sm.becomeFollower(m.Term, m.From)
  327. sm.handleSnapshot(m)
  328. case msgVote:
  329. sm.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  330. case msgVoteResp:
  331. gr := sm.poll(m.From, m.Index >= 0)
  332. switch sm.q() {
  333. case gr:
  334. sm.becomeLeader()
  335. sm.bcastAppend()
  336. case len(sm.votes) - gr:
  337. sm.becomeFollower(sm.term.Get(), none)
  338. }
  339. }
  340. return true
  341. }
  342. func stepFollower(sm *stateMachine, m Message) bool {
  343. switch m.Type {
  344. case msgProp:
  345. if sm.lead.Get() == none {
  346. return false
  347. }
  348. m.To = sm.lead.Get()
  349. sm.send(m)
  350. case msgApp:
  351. sm.lead.Set(m.From)
  352. sm.handleAppendEntries(m)
  353. case msgSnap:
  354. sm.handleSnapshot(m)
  355. case msgVote:
  356. if (sm.vote == none || sm.vote == m.From) && sm.log.isUpToDate(m.Index, m.LogTerm) {
  357. sm.vote = m.From
  358. sm.send(Message{To: m.From, Type: msgVoteResp, Index: sm.log.lastIndex()})
  359. } else {
  360. sm.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  361. }
  362. }
  363. return true
  364. }
  365. // maybeCompact tries to compact the log. It calls the snapshoter to take a snapshot and
  366. // then compact the log up-to the index at which the snapshot was taken.
  367. func (sm *stateMachine) maybeCompact() bool {
  368. if sm.snapshoter == nil || !sm.log.shouldCompact() {
  369. return false
  370. }
  371. sm.snapshoter.Snap(sm.log.applied, sm.log.term(sm.log.applied), sm.nodes())
  372. sm.log.compact(sm.log.applied)
  373. return true
  374. }
  375. // restore recovers the statemachine from a snapshot. It restores the log and the
  376. // configuration of statemachine. It calls the snapshoter to restore from the given
  377. // snapshot.
  378. func (sm *stateMachine) restore(s Snapshot) {
  379. if sm.snapshoter == nil {
  380. panic("try to restore from snapshot, but snapshoter is nil")
  381. }
  382. sm.log.restore(s.Index, s.Term)
  383. sm.index.Set(sm.log.lastIndex())
  384. sm.ins = make(map[int64]*index)
  385. for _, n := range s.Nodes {
  386. sm.ins[n] = &index{next: sm.log.lastIndex() + 1}
  387. if n == sm.id {
  388. sm.ins[n].match = sm.log.lastIndex()
  389. }
  390. }
  391. sm.pendingConf = false
  392. sm.snapshoter.Restore(s)
  393. }
  394. func (sm *stateMachine) needSnapshot(i int64) bool {
  395. if i < sm.log.offset {
  396. if sm.snapshoter == nil {
  397. panic("need snapshot but snapshoter is nil")
  398. }
  399. return true
  400. }
  401. return false
  402. }
  403. func (sm *stateMachine) nodes() []int64 {
  404. nodes := make([]int64, 0, len(sm.ins))
  405. for k := range sm.ins {
  406. nodes = append(nodes, k)
  407. }
  408. return nodes
  409. }