raft.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. package raft
  2. import (
  3. "errors"
  4. "fmt"
  5. "sort"
  6. )
  7. const none = -1
  8. type messageType int64
  9. const (
  10. msgHup messageType = iota
  11. msgBeat
  12. msgProp
  13. msgApp
  14. msgAppResp
  15. msgVote
  16. msgVoteResp
  17. msgSnap
  18. msgDenied
  19. )
  20. var mtmap = [...]string{
  21. msgHup: "msgHup",
  22. msgBeat: "msgBeat",
  23. msgProp: "msgProp",
  24. msgApp: "msgApp",
  25. msgAppResp: "msgAppResp",
  26. msgVote: "msgVote",
  27. msgVoteResp: "msgVoteResp",
  28. msgSnap: "msgSnap",
  29. msgDenied: "msgDenied",
  30. }
  31. func (mt messageType) String() string {
  32. return mtmap[int64(mt)]
  33. }
  34. var errNoLeader = errors.New("no leader")
  35. const (
  36. stateFollower stateType = iota
  37. stateCandidate
  38. stateLeader
  39. )
  40. type stateType int64
  41. var stmap = [...]string{
  42. stateFollower: "stateFollower",
  43. stateCandidate: "stateCandidate",
  44. stateLeader: "stateLeader",
  45. }
  46. var stepmap = [...]stepFunc{
  47. stateFollower: stepFollower,
  48. stateCandidate: stepCandidate,
  49. stateLeader: stepLeader,
  50. }
  51. func (st stateType) String() string {
  52. return stmap[int64(st)]
  53. }
  54. type Message struct {
  55. Type messageType
  56. To int64
  57. From int64
  58. Term int64
  59. LogTerm int64
  60. Index int64
  61. Entries []Entry
  62. Commit int64
  63. Snapshot Snapshot
  64. }
  65. func (m Message) String() string {
  66. return fmt.Sprintf("type=%v from=%x to=%x term=%d logTerm=%d i=%d ci=%d len(ents)=%d",
  67. m.Type, m.From, m.To, m.Term, m.LogTerm, m.Index, m.Commit, len(m.Entries))
  68. }
  69. type progress struct {
  70. match, next int64
  71. }
  72. func (pr *progress) update(n int64) {
  73. pr.match = n
  74. pr.next = n + 1
  75. }
  76. func (pr *progress) decr() {
  77. if pr.next--; pr.next < 1 {
  78. pr.next = 1
  79. }
  80. }
  81. func (pr *progress) String() string {
  82. return fmt.Sprintf("n=%d m=%d", pr.next, pr.match)
  83. }
  84. // int64Slice implements sort interface
  85. type int64Slice []int64
  86. func (p int64Slice) Len() int { return len(p) }
  87. func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
  88. func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  89. type raft struct {
  90. State
  91. id int64
  92. // the term we are participating in at any time
  93. index int64
  94. // the log
  95. raftLog *raftLog
  96. prs map[int64]*progress
  97. state stateType
  98. votes map[int64]bool
  99. msgs []Message
  100. // the leader id
  101. lead int64
  102. // pending reconfiguration
  103. configuring bool
  104. // promotable indicates whether state machine could be promoted.
  105. // New machine has to wait until it has been added to the cluster, or it
  106. // may become the leader of the cluster without it.
  107. promotable bool
  108. }
  109. func newRaft(id int64, peers []int64) *raft {
  110. if id == none {
  111. panic("cannot use none id")
  112. }
  113. r := &raft{id: id, lead: none, raftLog: newLog(), prs: make(map[int64]*progress)}
  114. for _, p := range peers {
  115. r.prs[p] = &progress{}
  116. }
  117. r.reset(0)
  118. return r
  119. }
  120. func (r *raft) hasLeader() bool { return r.lead != none }
  121. func (r *raft) propose(data []byte) {
  122. r.Step(Message{From: r.id, Type: msgProp, Entries: []Entry{{Data: data}}})
  123. }
  124. func (r *raft) String() string {
  125. s := fmt.Sprintf(`state=%v term=%d`, r.state, r.Term)
  126. switch r.state {
  127. case stateFollower:
  128. s += fmt.Sprintf(" vote=%v lead=%v", r.Vote, r.lead)
  129. case stateCandidate:
  130. s += fmt.Sprintf(` votes="%v"`, r.votes)
  131. case stateLeader:
  132. s += fmt.Sprintf(` prs="%v"`, r.prs)
  133. }
  134. return s
  135. }
  136. func (r *raft) poll(id int64, v bool) (granted int) {
  137. if _, ok := r.votes[id]; !ok {
  138. r.votes[id] = v
  139. }
  140. for _, vv := range r.votes {
  141. if vv {
  142. granted++
  143. }
  144. }
  145. return granted
  146. }
  147. // send persists state to stable storage and then sends to its mailbox.
  148. func (r *raft) send(m Message) {
  149. m.From = r.id
  150. m.Term = r.Term
  151. r.msgs = append(r.msgs, m)
  152. }
  153. // sendAppend sends RRPC, with entries to the given peer.
  154. func (r *raft) sendAppend(to int64) {
  155. pr := r.prs[to]
  156. m := Message{}
  157. m.To = to
  158. m.Index = pr.next - 1
  159. if r.needSnapshot(m.Index) {
  160. m.Type = msgSnap
  161. m.Snapshot = r.raftLog.snapshot
  162. } else {
  163. m.Type = msgApp
  164. m.LogTerm = r.raftLog.term(pr.next - 1)
  165. m.Entries = r.raftLog.entries(pr.next)
  166. m.Commit = r.raftLog.committed
  167. }
  168. r.send(m)
  169. }
  170. // sendHeartbeat sends RRPC, without entries to the given peer.
  171. func (r *raft) sendHeartbeat(to int64) {
  172. pr := r.prs[to]
  173. index := max(pr.next-1, r.raftLog.lastIndex())
  174. m := Message{
  175. To: to,
  176. Type: msgApp,
  177. Index: index,
  178. LogTerm: r.raftLog.term(index),
  179. Commit: r.raftLog.committed,
  180. }
  181. r.send(m)
  182. }
  183. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to r.mis.
  184. func (r *raft) bcastAppend() {
  185. for i := range r.prs {
  186. if i == r.id {
  187. continue
  188. }
  189. r.sendAppend(i)
  190. }
  191. }
  192. // bcastHeartbeat sends RRPC, without entries to all the peers.
  193. func (r *raft) bcastHeartbeat() {
  194. for i := range r.prs {
  195. if i == r.id {
  196. continue
  197. }
  198. r.sendHeartbeat(i)
  199. }
  200. }
  201. func (r *raft) maybeCommit() bool {
  202. // TODO(bmizerany): optimize.. Currently naive
  203. mis := make(int64Slice, 0, len(r.prs))
  204. for i := range r.prs {
  205. mis = append(mis, r.prs[i].match)
  206. }
  207. sort.Sort(sort.Reverse(mis))
  208. mci := mis[r.q()-1]
  209. return r.raftLog.maybeCommit(mci, r.Term)
  210. }
  211. func (r *raft) reset(term int64) {
  212. r.Term = term
  213. r.lead = none
  214. r.Vote = none
  215. r.votes = make(map[int64]bool)
  216. for i := range r.prs {
  217. r.prs[i] = &progress{next: r.raftLog.lastIndex() + 1}
  218. if i == r.id {
  219. r.prs[i].match = r.raftLog.lastIndex()
  220. }
  221. }
  222. }
  223. func (r *raft) q() int {
  224. return len(r.prs)/2 + 1
  225. }
  226. func (r *raft) appendEntry(e Entry) {
  227. e.Term = r.Term
  228. e.Index = r.raftLog.lastIndex() + 1
  229. r.LastIndex = r.raftLog.append(r.raftLog.lastIndex(), e)
  230. r.prs[r.id].update(r.raftLog.lastIndex())
  231. r.maybeCommit()
  232. }
  233. func (r *raft) becomeFollower(term int64, lead int64) {
  234. r.reset(term)
  235. r.lead = lead
  236. r.state = stateFollower
  237. r.configuring = false
  238. }
  239. func (r *raft) becomeCandidate() {
  240. // TODO(xiangli) remove the panic when the raft implementation is stable
  241. if r.state == stateLeader {
  242. panic("invalid transition [leader -> candidate]")
  243. }
  244. r.reset(r.Term + 1)
  245. r.Vote = r.id
  246. r.state = stateCandidate
  247. }
  248. func (r *raft) becomeLeader() {
  249. // TODO(xiangli) remove the panic when the raft implementation is stable
  250. if r.state == stateFollower {
  251. panic("invalid transition [follower -> leader]")
  252. }
  253. r.reset(r.Term)
  254. r.lead = r.id
  255. r.state = stateLeader
  256. for _, e := range r.raftLog.entries(r.raftLog.committed + 1) {
  257. if e.isConfig() {
  258. r.configuring = true
  259. }
  260. }
  261. r.appendEntry(Entry{Type: Normal, Data: nil})
  262. }
  263. func (r *raft) ReadMessages() []Message {
  264. msgs := r.msgs
  265. r.msgs = make([]Message, 0)
  266. return msgs
  267. }
  268. func (r *raft) Step(m Message) error {
  269. // TODO(bmizerany): this likely allocs - prevent that.
  270. defer func() { r.Commit = r.raftLog.committed }()
  271. if m.Type == msgHup {
  272. r.becomeCandidate()
  273. if r.q() == r.poll(r.id, true) {
  274. r.becomeLeader()
  275. }
  276. for i := range r.prs {
  277. if i == r.id {
  278. continue
  279. }
  280. lasti := r.raftLog.lastIndex()
  281. r.send(Message{To: i, Type: msgVote, Index: lasti, LogTerm: r.raftLog.term(lasti)})
  282. }
  283. }
  284. switch {
  285. case m.Term == 0:
  286. // local message
  287. case m.Term > r.Term:
  288. lead := m.From
  289. if m.Type == msgVote {
  290. lead = none
  291. }
  292. r.becomeFollower(m.Term, lead)
  293. case m.Term < r.Term:
  294. // ignore
  295. }
  296. stepmap[r.state](r, m)
  297. return nil
  298. }
  299. func (r *raft) handleAppendEntries(m Message) {
  300. if r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  301. r.LastIndex = r.raftLog.lastIndex()
  302. r.send(Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  303. } else {
  304. r.send(Message{To: m.From, Type: msgAppResp, Index: -1})
  305. }
  306. }
  307. func (r *raft) handleSnapshot(m Message) {
  308. if r.restore(m.Snapshot) {
  309. r.send(Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  310. } else {
  311. r.send(Message{To: m.From, Type: msgAppResp, Index: r.raftLog.committed})
  312. }
  313. }
  314. func (r *raft) addNode(id int64) {
  315. r.setProgress(id, 0, r.raftLog.lastIndex()+1)
  316. r.configuring = false
  317. if id == r.id {
  318. r.promotable = true
  319. }
  320. }
  321. func (r *raft) removeNode(id int64) {
  322. r.delProgress(id)
  323. r.configuring = false
  324. }
  325. type stepFunc func(r *raft, m Message)
  326. func stepLeader(r *raft, m Message) {
  327. switch m.Type {
  328. case msgBeat:
  329. r.bcastHeartbeat()
  330. case msgProp:
  331. if len(m.Entries) != 1 {
  332. panic("unexpected length(entries) of a msgProp")
  333. }
  334. e := m.Entries[0]
  335. if e.isConfig() {
  336. if r.configuring {
  337. panic("pending conf")
  338. }
  339. r.configuring = true
  340. }
  341. r.appendEntry(e)
  342. r.bcastAppend()
  343. case msgAppResp:
  344. if m.Index < 0 {
  345. r.prs[m.From].decr()
  346. r.sendAppend(m.From)
  347. } else {
  348. r.prs[m.From].update(m.Index)
  349. if r.maybeCommit() {
  350. r.bcastAppend()
  351. }
  352. }
  353. case msgVote:
  354. r.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  355. }
  356. }
  357. func stepCandidate(r *raft, m Message) {
  358. switch m.Type {
  359. case msgProp:
  360. panic("no leader")
  361. case msgApp:
  362. r.becomeFollower(r.Term, m.From)
  363. r.handleAppendEntries(m)
  364. case msgSnap:
  365. r.becomeFollower(m.Term, m.From)
  366. r.handleSnapshot(m)
  367. case msgVote:
  368. r.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  369. case msgVoteResp:
  370. gr := r.poll(m.From, m.Index >= 0)
  371. switch r.q() {
  372. case gr:
  373. r.becomeLeader()
  374. r.bcastAppend()
  375. case len(r.votes) - gr:
  376. r.becomeFollower(r.Term, none)
  377. }
  378. }
  379. }
  380. func stepFollower(r *raft, m Message) {
  381. switch m.Type {
  382. case msgProp:
  383. if r.lead == none {
  384. panic("no leader")
  385. }
  386. m.To = r.lead
  387. r.send(m)
  388. case msgApp:
  389. r.lead = m.From
  390. r.handleAppendEntries(m)
  391. case msgSnap:
  392. r.handleSnapshot(m)
  393. case msgVote:
  394. if (r.Vote == none || r.Vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
  395. r.Vote = m.From
  396. r.send(Message{To: m.From, Type: msgVoteResp, Index: r.raftLog.lastIndex()})
  397. } else {
  398. r.send(Message{To: m.From, Type: msgVoteResp, Index: -1})
  399. }
  400. }
  401. }
  402. func (r *raft) compact(d []byte) {
  403. r.raftLog.snap(d, r.raftLog.applied, r.raftLog.term(r.raftLog.applied), r.nodes())
  404. r.raftLog.compact(r.raftLog.applied)
  405. }
  406. // restore recovers the statemachine from a snapshot. It restores the log and the
  407. // configuration of statemachine.
  408. func (r *raft) restore(s Snapshot) bool {
  409. if s.Index <= r.raftLog.committed {
  410. return false
  411. }
  412. r.raftLog.restore(s)
  413. r.LastIndex = r.raftLog.lastIndex()
  414. r.prs = make(map[int64]*progress)
  415. for _, n := range s.Nodes {
  416. if n == r.id {
  417. r.setProgress(n, r.raftLog.lastIndex(), r.raftLog.lastIndex()+1)
  418. } else {
  419. r.setProgress(n, 0, r.raftLog.lastIndex()+1)
  420. }
  421. }
  422. r.configuring = false
  423. return true
  424. }
  425. func (r *raft) needSnapshot(i int64) bool {
  426. if i < r.raftLog.offset {
  427. if r.raftLog.snapshot.IsEmpty() {
  428. panic("need non-empty snapshot")
  429. }
  430. return true
  431. }
  432. return false
  433. }
  434. func (r *raft) nodes() []int64 {
  435. nodes := make([]int64, 0, len(r.prs))
  436. for k := range r.prs {
  437. nodes = append(nodes, k)
  438. }
  439. return nodes
  440. }
  441. func (r *raft) setProgress(id, match, next int64) {
  442. r.prs[id] = &progress{next: next, match: match}
  443. }
  444. func (r *raft) delProgress(id int64) {
  445. delete(r.prs, id)
  446. }
  447. func (r *raft) loadEnts(ents []Entry) {
  448. if !r.raftLog.isEmpty() {
  449. panic("cannot load entries when log is not empty")
  450. }
  451. r.raftLog.append(0, ents...)
  452. r.raftLog.unstable = r.raftLog.lastIndex() + 1
  453. }
  454. func (r *raft) loadState(state State) {
  455. r.raftLog.committed = state.Commit
  456. r.Term = state.Term
  457. r.Vote = state.Vote
  458. }
  459. func (s *State) IsEmpty() bool {
  460. return s.Term == 0
  461. }