raft.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. package raft
  2. import (
  3. "errors"
  4. "fmt"
  5. "math/rand"
  6. "sort"
  7. pb "github.com/coreos/etcd/raft/raftpb"
  8. )
  9. const None int64 = 0
  10. type messageType int64
  11. const (
  12. msgHup int64 = iota
  13. msgBeat
  14. msgProp
  15. msgApp
  16. msgAppResp
  17. msgVote
  18. msgVoteResp
  19. msgSnap
  20. msgDenied
  21. )
  22. var mtmap = [...]string{
  23. msgHup: "msgHup",
  24. msgBeat: "msgBeat",
  25. msgProp: "msgProp",
  26. msgApp: "msgApp",
  27. msgAppResp: "msgAppResp",
  28. msgVote: "msgVote",
  29. msgVoteResp: "msgVoteResp",
  30. msgSnap: "msgSnap",
  31. msgDenied: "msgDenied",
  32. }
  33. func (mt messageType) String() string {
  34. return mtmap[int64(mt)]
  35. }
  36. var errNoLeader = errors.New("no leader")
  37. const (
  38. StateFollower StateType = iota
  39. StateCandidate
  40. StateLeader
  41. )
  42. type StateType int64
  43. var stmap = [...]string{
  44. StateFollower: "StateFollower",
  45. StateCandidate: "StateCandidate",
  46. StateLeader: "StateLeader",
  47. }
  48. func (st StateType) String() string {
  49. return stmap[int64(st)]
  50. }
  51. type progress struct {
  52. match, next int64
  53. }
  54. func (pr *progress) update(n int64) {
  55. pr.match = n
  56. pr.next = n + 1
  57. }
  58. // maybeDecrTo returns false if the given to index comes from an out of order message.
  59. // Otherwise it decreases the progress next index and returns true.
  60. func (pr *progress) maybeDecrTo(to int64) bool {
  61. // the rejection must be stale if the
  62. // progress has matched with follower
  63. // or "to" does not match next - 1
  64. if pr.match != 0 || pr.next-1 != to {
  65. return false
  66. }
  67. if pr.next--; pr.next < 1 {
  68. pr.next = 1
  69. }
  70. return true
  71. }
  72. func (pr *progress) String() string {
  73. return fmt.Sprintf("n=%d m=%d", pr.next, pr.match)
  74. }
  75. // int64Slice implements sort interface
  76. type int64Slice []int64
  77. func (p int64Slice) Len() int { return len(p) }
  78. func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
  79. func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  80. type raft struct {
  81. pb.HardState
  82. id int64
  83. // the log
  84. raftLog *raftLog
  85. prs map[int64]*progress
  86. state StateType
  87. votes map[int64]bool
  88. msgs []pb.Message
  89. // the leader id
  90. lead int64
  91. // New configuration is ignored if there exists unapplied configuration.
  92. pendingConf bool
  93. // TODO: need GC and recovery from snapshot
  94. removed map[int64]bool
  95. elapsed int // number of ticks since the last msg
  96. heartbeatTimeout int
  97. electionTimeout int
  98. tick func()
  99. step stepFunc
  100. }
  101. func newRaft(id int64, peers []int64, election, heartbeat int) *raft {
  102. if id == None {
  103. panic("cannot use none id")
  104. }
  105. rand.Seed(id)
  106. r := &raft{
  107. id: id,
  108. lead: None,
  109. raftLog: newLog(),
  110. prs: make(map[int64]*progress),
  111. removed: make(map[int64]bool),
  112. electionTimeout: election,
  113. heartbeatTimeout: heartbeat,
  114. }
  115. for _, p := range peers {
  116. r.prs[p] = &progress{}
  117. }
  118. r.becomeFollower(0, None)
  119. return r
  120. }
  121. func (r *raft) hasLeader() bool { return r.lead != None }
  122. func (r *raft) shouldStop() bool { return r.removed[r.id] }
  123. func (r *raft) softState() *SoftState {
  124. return &SoftState{Lead: r.lead, RaftState: r.state, Nodes: r.nodes(), ShouldStop: r.shouldStop()}
  125. }
  126. func (r *raft) String() string {
  127. s := fmt.Sprintf(`state=%v term=%d`, r.state, r.Term)
  128. switch r.state {
  129. case StateFollower:
  130. s += fmt.Sprintf(" vote=%v lead=%v", r.Vote, r.lead)
  131. case StateCandidate:
  132. s += fmt.Sprintf(` votes="%v"`, r.votes)
  133. case StateLeader:
  134. s += fmt.Sprintf(` prs="%v"`, r.prs)
  135. }
  136. return s
  137. }
  138. func (r *raft) poll(id int64, v bool) (granted int) {
  139. if _, ok := r.votes[id]; !ok {
  140. r.votes[id] = v
  141. }
  142. for _, vv := range r.votes {
  143. if vv {
  144. granted++
  145. }
  146. }
  147. return granted
  148. }
  149. // send persists state to stable storage and then sends to its mailbox.
  150. func (r *raft) send(m pb.Message) {
  151. m.From = r.id
  152. // do not attach term to msgProp
  153. // proposals are a way to forward to the leader and
  154. // should be treated as local message.
  155. if m.Type != msgProp {
  156. m.Term = r.Term
  157. }
  158. r.msgs = append(r.msgs, m)
  159. }
  160. // sendAppend sends RRPC, with entries to the given peer.
  161. func (r *raft) sendAppend(to int64) {
  162. pr := r.prs[to]
  163. m := pb.Message{}
  164. m.To = to
  165. m.Index = pr.next - 1
  166. if r.needSnapshot(m.Index) {
  167. m.Type = msgSnap
  168. m.Snapshot = r.raftLog.snapshot
  169. } else {
  170. m.Type = msgApp
  171. m.LogTerm = r.raftLog.term(pr.next - 1)
  172. m.Entries = r.raftLog.entries(pr.next)
  173. m.Commit = r.raftLog.committed
  174. }
  175. r.send(m)
  176. }
  177. // sendHeartbeat sends an empty msgApp
  178. func (r *raft) sendHeartbeat(to int64) {
  179. m := pb.Message{
  180. To: to,
  181. Type: msgApp,
  182. }
  183. r.send(m)
  184. }
  185. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to r.mis.
  186. func (r *raft) bcastAppend() {
  187. for i := range r.prs {
  188. if i == r.id {
  189. continue
  190. }
  191. r.sendAppend(i)
  192. }
  193. }
  194. // bcastHeartbeat sends RRPC, without entries to all the peers.
  195. func (r *raft) bcastHeartbeat() {
  196. for i := range r.prs {
  197. if i == r.id {
  198. continue
  199. }
  200. r.sendHeartbeat(i)
  201. }
  202. }
  203. func (r *raft) maybeCommit() bool {
  204. // TODO(bmizerany): optimize.. Currently naive
  205. mis := make(int64Slice, 0, len(r.prs))
  206. for i := range r.prs {
  207. mis = append(mis, r.prs[i].match)
  208. }
  209. sort.Sort(sort.Reverse(mis))
  210. mci := mis[r.q()-1]
  211. return r.raftLog.maybeCommit(mci, r.Term)
  212. }
  213. func (r *raft) reset(term int64) {
  214. r.Term = term
  215. r.lead = None
  216. r.Vote = None
  217. r.elapsed = 0
  218. r.votes = make(map[int64]bool)
  219. for i := range r.prs {
  220. r.prs[i] = &progress{next: r.raftLog.lastIndex() + 1}
  221. if i == r.id {
  222. r.prs[i].match = r.raftLog.lastIndex()
  223. }
  224. }
  225. r.pendingConf = false
  226. }
  227. func (r *raft) q() int {
  228. return len(r.prs)/2 + 1
  229. }
  230. func (r *raft) appendEntry(e pb.Entry) {
  231. e.Term = r.Term
  232. e.Index = r.raftLog.lastIndex() + 1
  233. r.raftLog.append(r.raftLog.lastIndex(), e)
  234. r.prs[r.id].update(r.raftLog.lastIndex())
  235. r.maybeCommit()
  236. }
  237. // tickElection is ran by followers and candidates after r.electionTimeout.
  238. func (r *raft) tickElection() {
  239. if !r.promotable() {
  240. r.elapsed = 0
  241. return
  242. }
  243. r.elapsed++
  244. if r.isElectionTimeout() {
  245. r.elapsed = 0
  246. r.Step(pb.Message{From: r.id, Type: msgHup})
  247. }
  248. }
  249. // tickHeartbeat is ran by leaders to send a msgBeat after r.heartbeatTimeout.
  250. func (r *raft) tickHeartbeat() {
  251. r.elapsed++
  252. if r.elapsed > r.heartbeatTimeout {
  253. r.elapsed = 0
  254. r.Step(pb.Message{From: r.id, Type: msgBeat})
  255. }
  256. }
  257. func (r *raft) becomeFollower(term int64, lead int64) {
  258. r.step = stepFollower
  259. r.reset(term)
  260. r.tick = r.tickElection
  261. r.lead = lead
  262. r.state = StateFollower
  263. }
  264. func (r *raft) becomeCandidate() {
  265. // TODO(xiangli) remove the panic when the raft implementation is stable
  266. if r.state == StateLeader {
  267. panic("invalid transition [leader -> candidate]")
  268. }
  269. r.step = stepCandidate
  270. r.reset(r.Term + 1)
  271. r.tick = r.tickElection
  272. r.Vote = r.id
  273. r.state = StateCandidate
  274. }
  275. func (r *raft) becomeLeader() {
  276. // TODO(xiangli) remove the panic when the raft implementation is stable
  277. if r.state == StateFollower {
  278. panic("invalid transition [follower -> leader]")
  279. }
  280. r.step = stepLeader
  281. r.reset(r.Term)
  282. r.tick = r.tickHeartbeat
  283. r.lead = r.id
  284. r.state = StateLeader
  285. for _, e := range r.raftLog.entries(r.raftLog.committed + 1) {
  286. if e.Type != pb.EntryConfChange {
  287. continue
  288. }
  289. if r.pendingConf {
  290. panic("unexpected double uncommitted config entry")
  291. }
  292. r.pendingConf = true
  293. }
  294. r.appendEntry(pb.Entry{Data: nil})
  295. }
  296. func (r *raft) ReadMessages() []pb.Message {
  297. msgs := r.msgs
  298. r.msgs = make([]pb.Message, 0)
  299. return msgs
  300. }
  301. func (r *raft) campaign() {
  302. r.becomeCandidate()
  303. if r.q() == r.poll(r.id, true) {
  304. r.becomeLeader()
  305. }
  306. for i := range r.prs {
  307. if i == r.id {
  308. continue
  309. }
  310. lasti := r.raftLog.lastIndex()
  311. r.send(pb.Message{To: i, Type: msgVote, Index: lasti, LogTerm: r.raftLog.term(lasti)})
  312. }
  313. }
  314. func (r *raft) Step(m pb.Message) error {
  315. // TODO(bmizerany): this likely allocs - prevent that.
  316. defer func() { r.Commit = r.raftLog.committed }()
  317. if r.removed[m.From] {
  318. if m.From != r.id {
  319. r.send(pb.Message{To: m.From, Type: msgDenied})
  320. }
  321. // TODO: return an error?
  322. return nil
  323. }
  324. if m.Type == msgDenied {
  325. r.removed[r.id] = true
  326. // TODO: return an error?
  327. return nil
  328. }
  329. if m.Type == msgHup {
  330. r.campaign()
  331. }
  332. switch {
  333. case m.Term == 0:
  334. // local message
  335. case m.Term > r.Term:
  336. lead := m.From
  337. if m.Type == msgVote {
  338. lead = None
  339. }
  340. r.becomeFollower(m.Term, lead)
  341. case m.Term < r.Term:
  342. // ignore
  343. return nil
  344. }
  345. r.step(r, m)
  346. return nil
  347. }
  348. func (r *raft) handleAppendEntries(m pb.Message) {
  349. if r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  350. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  351. } else {
  352. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: m.Index, Reject: true})
  353. }
  354. }
  355. func (r *raft) handleSnapshot(m pb.Message) {
  356. if r.restore(m.Snapshot) {
  357. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  358. } else {
  359. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.committed})
  360. }
  361. }
  362. func (r *raft) addNode(id int64) {
  363. r.setProgress(id, 0, r.raftLog.lastIndex()+1)
  364. r.pendingConf = false
  365. }
  366. func (r *raft) removeNode(id int64) {
  367. r.delProgress(id)
  368. r.pendingConf = false
  369. r.removed[id] = true
  370. }
  371. type stepFunc func(r *raft, m pb.Message)
  372. func stepLeader(r *raft, m pb.Message) {
  373. switch m.Type {
  374. case msgBeat:
  375. r.bcastHeartbeat()
  376. case msgProp:
  377. if len(m.Entries) != 1 {
  378. panic("unexpected length(entries) of a msgProp")
  379. }
  380. e := m.Entries[0]
  381. if e.Type == pb.EntryConfChange {
  382. if r.pendingConf {
  383. return
  384. }
  385. r.pendingConf = true
  386. }
  387. r.appendEntry(e)
  388. r.bcastAppend()
  389. case msgAppResp:
  390. if m.Reject {
  391. if r.prs[m.From].maybeDecrTo(m.Index) {
  392. r.sendAppend(m.From)
  393. }
  394. } else {
  395. r.prs[m.From].update(m.Index)
  396. if r.maybeCommit() {
  397. r.bcastAppend()
  398. }
  399. }
  400. case msgVote:
  401. r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
  402. }
  403. }
  404. func stepCandidate(r *raft, m pb.Message) {
  405. switch m.Type {
  406. case msgProp:
  407. panic("no leader")
  408. case msgApp:
  409. r.becomeFollower(r.Term, m.From)
  410. r.handleAppendEntries(m)
  411. case msgSnap:
  412. r.becomeFollower(m.Term, m.From)
  413. r.handleSnapshot(m)
  414. case msgVote:
  415. r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
  416. case msgVoteResp:
  417. gr := r.poll(m.From, !m.Reject)
  418. switch r.q() {
  419. case gr:
  420. r.becomeLeader()
  421. r.bcastAppend()
  422. case len(r.votes) - gr:
  423. r.becomeFollower(r.Term, None)
  424. }
  425. }
  426. }
  427. func stepFollower(r *raft, m pb.Message) {
  428. switch m.Type {
  429. case msgProp:
  430. if r.lead == None {
  431. panic("no leader")
  432. }
  433. m.To = r.lead
  434. r.send(m)
  435. case msgApp:
  436. r.elapsed = 0
  437. r.lead = m.From
  438. r.handleAppendEntries(m)
  439. case msgSnap:
  440. r.elapsed = 0
  441. r.handleSnapshot(m)
  442. case msgVote:
  443. if (r.Vote == None || r.Vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
  444. r.elapsed = 0
  445. r.Vote = m.From
  446. r.send(pb.Message{To: m.From, Type: msgVoteResp})
  447. } else {
  448. r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
  449. }
  450. }
  451. }
  452. func (r *raft) compact(index int64, nodes []int64, d []byte) {
  453. if index > r.raftLog.applied {
  454. panic(fmt.Sprintf("raft: compact index (%d) exceeds applied index (%d)", index, r.raftLog.applied))
  455. }
  456. r.raftLog.snap(d, index, r.raftLog.term(index), nodes)
  457. r.raftLog.compact(index)
  458. }
  459. // restore recovers the statemachine from a snapshot. It restores the log and the
  460. // configuration of statemachine.
  461. func (r *raft) restore(s pb.Snapshot) bool {
  462. if s.Index <= r.raftLog.committed {
  463. return false
  464. }
  465. r.raftLog.restore(s)
  466. r.prs = make(map[int64]*progress)
  467. for _, n := range s.Nodes {
  468. if n == r.id {
  469. r.setProgress(n, r.raftLog.lastIndex(), r.raftLog.lastIndex()+1)
  470. } else {
  471. r.setProgress(n, 0, r.raftLog.lastIndex()+1)
  472. }
  473. }
  474. return true
  475. }
  476. func (r *raft) needSnapshot(i int64) bool {
  477. if i < r.raftLog.offset {
  478. if r.raftLog.snapshot.Term == 0 {
  479. panic("need non-empty snapshot")
  480. }
  481. return true
  482. }
  483. return false
  484. }
  485. func (r *raft) nodes() []int64 {
  486. nodes := make([]int64, 0, len(r.prs))
  487. for k := range r.prs {
  488. nodes = append(nodes, k)
  489. }
  490. return nodes
  491. }
  492. func (r *raft) setProgress(id, match, next int64) {
  493. r.prs[id] = &progress{next: next, match: match}
  494. }
  495. func (r *raft) delProgress(id int64) {
  496. delete(r.prs, id)
  497. }
  498. // promotable indicates whether state machine can be promoted to leader,
  499. // which is true when its own id is in progress list.
  500. func (r *raft) promotable() bool {
  501. _, ok := r.prs[r.id]
  502. return ok
  503. }
  504. func (r *raft) loadEnts(ents []pb.Entry) {
  505. r.raftLog.load(ents)
  506. }
  507. func (r *raft) loadState(state pb.HardState) {
  508. r.raftLog.committed = state.Commit
  509. r.Term = state.Term
  510. r.Vote = state.Vote
  511. r.Commit = state.Commit
  512. }
  513. // isElectionTimeout returns true if r.elapsed is greater than the
  514. // randomized election timeout in (electiontimeout, 2 * electiontimeout - 1).
  515. // Otherwise, it returns false.
  516. func (r *raft) isElectionTimeout() bool {
  517. d := r.elapsed - r.electionTimeout
  518. if d < 0 {
  519. return false
  520. }
  521. return d > rand.Int()%r.electionTimeout
  522. }