| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602 |
- package raft
- import (
- "errors"
- "fmt"
- "math/rand"
- "sort"
- pb "github.com/coreos/etcd/raft/raftpb"
- )
// None is the sentinel node ID meaning "no node". The leader and vote
// fields are set to it when no leader is known or no vote has been cast,
// and newRaft refuses it as a node's own ID.
const None = int64(0)
// messageType is an integer type whose only purpose is to give the message
// kinds below a printable name through String.
type messageType int64

// Message kinds. The constants are typed int64, not messageType, which is
// what lets the rest of this file compare them with and assign them to
// pb.Message.Type directly. Convert with messageType(x) to print one by
// name.
const (
	msgHup int64 = iota
	msgBeat
	msgProp
	msgApp
	msgAppResp
	msgVote
	msgVoteResp
	msgSnap
	msgDenied
)

// mtmap holds the printable name of every message kind, indexed by the
// kind's numeric value. It must stay in the same order as the constants
// above.
var mtmap = [...]string{
	"msgHup",
	"msgBeat",
	"msgProp",
	"msgApp",
	"msgAppResp",
	"msgVote",
	"msgVoteResp",
	"msgSnap",
	"msgDenied",
}

// String returns the name of the message kind. A value outside the known
// range is rendered as "messageType(<n>)" instead of indexing past the end
// of mtmap and panicking, so formatting an unexpected message can never
// crash the caller.
func (mt messageType) String() string {
	if mt < 0 || int64(mt) >= int64(len(mtmap)) {
		return fmt.Sprintf("messageType(%d)", int64(mt))
	}
	return mtmap[mt]
}
// errNoLeader is a sentinel error carrying the message "no leader".
var (
	errNoLeader = errors.New("no leader")
)
// StateType represents the role of a node in a cluster.
type StateType int64

// Possible values for StateType.
const (
	StateFollower StateType = iota
	StateCandidate
	StateLeader
)

// stmap holds the printable name of every StateType, indexed by its
// numeric value. It must stay in the same order as the constants above.
var stmap = [...]string{
	"StateFollower",
	"StateCandidate",
	"StateLeader",
}

// String returns the name of the state. A value outside the known range is
// rendered as "StateType(<n>)" instead of indexing past the end of stmap
// and panicking.
func (st StateType) String() string {
	if st < 0 || int64(st) >= int64(len(stmap)) {
		return fmt.Sprintf("StateType(%d)", int64(st))
	}
	return stmap[st]
}
// progress is the bookkeeping kept for one node's copy of the log.
type progress struct {
	// match is the index last recorded through update; it stays 0 until
	// then.
	match int64
	// next is the index of the first entry to send to the node next.
	next int64
}

// update records n as the matched index and moves next to the entry right
// after it.
func (pr *progress) update(n int64) {
	pr.match, pr.next = n, n+1
}

// maybeDecrTo handles a rejection that names index "to". It backs next off
// by one (never below 1) and returns true when the rejection is current.
// It returns false, changing nothing, when the rejection must be stale:
// either the progress has already matched with the follower, or "to" is
// not next-1.
func (pr *progress) maybeDecrTo(to int64) bool {
	if pr.match == 0 && to == pr.next-1 {
		pr.next--
		if pr.next < 1 {
			pr.next = 1
		}
		return true
	}
	return false
}

// String renders the progress as "n=<next> m=<match>".
func (pr *progress) String() string {
	next, match := pr.next, pr.match
	return fmt.Sprintf("n=%d m=%d", next, match)
}
// int64Slice attaches the three sort.Interface methods to []int64, ordering
// elements from smallest to largest.
type int64Slice []int64

// Len reports how many elements the slice holds.
func (s int64Slice) Len() int {
	return len(s)
}

// Less reports whether the element at a is smaller than the element at b.
func (s int64Slice) Less(a, b int) bool {
	return s[a] < s[b]
}

// Swap exchanges the elements at a and b.
func (s int64Slice) Swap(a, b int) {
	s[b], s[a] = s[a], s[b]
}
- type raft struct {
- pb.HardState
- id int64
- // the log
- raftLog *raftLog
- prs map[int64]*progress
- state StateType
- votes map[int64]bool
- msgs []pb.Message
- // the leader id
- lead int64
- // New configuration is ignored if there exists unapplied configuration.
- pendingConf bool
- // TODO: need GC and recovery from snapshot
- removed map[int64]bool
- elapsed int // number of ticks since the last msg
- heartbeatTimeout int
- electionTimeout int
- tick func()
- step stepFunc
- }
- func newRaft(id int64, peers []int64, election, heartbeat int) *raft {
- if id == None {
- panic("cannot use none id")
- }
- rand.Seed(id)
- r := &raft{
- id: id,
- lead: None,
- raftLog: newLog(),
- prs: make(map[int64]*progress),
- removed: make(map[int64]bool),
- electionTimeout: election,
- heartbeatTimeout: heartbeat,
- }
- for _, p := range peers {
- r.prs[p] = &progress{}
- }
- r.becomeFollower(0, None)
- return r
- }
- func (r *raft) hasLeader() bool { return r.lead != None }
- func (r *raft) shouldStop() bool { return r.removed[r.id] }
- func (r *raft) softState() *SoftState {
- return &SoftState{Lead: r.lead, RaftState: r.state, Nodes: r.nodes(), ShouldStop: r.shouldStop()}
- }
- func (r *raft) String() string {
- s := fmt.Sprintf(`state=%v term=%d`, r.state, r.Term)
- switch r.state {
- case StateFollower:
- s += fmt.Sprintf(" vote=%v lead=%v", r.Vote, r.lead)
- case StateCandidate:
- s += fmt.Sprintf(` votes="%v"`, r.votes)
- case StateLeader:
- s += fmt.Sprintf(` prs="%v"`, r.prs)
- }
- return s
- }
- func (r *raft) poll(id int64, v bool) (granted int) {
- if _, ok := r.votes[id]; !ok {
- r.votes[id] = v
- }
- for _, vv := range r.votes {
- if vv {
- granted++
- }
- }
- return granted
- }
- // send persists state to stable storage and then sends to its mailbox.
- func (r *raft) send(m pb.Message) {
- m.From = r.id
- // do not attach term to msgProp
- // proposals are a way to forward to the leader and
- // should be treated as local message.
- if m.Type != msgProp {
- m.Term = r.Term
- }
- r.msgs = append(r.msgs, m)
- }
- // sendAppend sends RRPC, with entries to the given peer.
- func (r *raft) sendAppend(to int64) {
- pr := r.prs[to]
- m := pb.Message{}
- m.To = to
- m.Index = pr.next - 1
- if r.needSnapshot(m.Index) {
- m.Type = msgSnap
- m.Snapshot = r.raftLog.snapshot
- } else {
- m.Type = msgApp
- m.LogTerm = r.raftLog.term(pr.next - 1)
- m.Entries = r.raftLog.entries(pr.next)
- m.Commit = r.raftLog.committed
- }
- r.send(m)
- }
- // sendHeartbeat sends an empty msgApp
- func (r *raft) sendHeartbeat(to int64) {
- m := pb.Message{
- To: to,
- Type: msgApp,
- }
- r.send(m)
- }
- // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to r.mis.
- func (r *raft) bcastAppend() {
- for i := range r.prs {
- if i == r.id {
- continue
- }
- r.sendAppend(i)
- }
- }
- // bcastHeartbeat sends RRPC, without entries to all the peers.
- func (r *raft) bcastHeartbeat() {
- for i := range r.prs {
- if i == r.id {
- continue
- }
- r.sendHeartbeat(i)
- }
- }
- func (r *raft) maybeCommit() bool {
- // TODO(bmizerany): optimize.. Currently naive
- mis := make(int64Slice, 0, len(r.prs))
- for i := range r.prs {
- mis = append(mis, r.prs[i].match)
- }
- sort.Sort(sort.Reverse(mis))
- mci := mis[r.q()-1]
- return r.raftLog.maybeCommit(mci, r.Term)
- }
- func (r *raft) reset(term int64) {
- r.Term = term
- r.lead = None
- r.Vote = None
- r.elapsed = 0
- r.votes = make(map[int64]bool)
- for i := range r.prs {
- r.prs[i] = &progress{next: r.raftLog.lastIndex() + 1}
- if i == r.id {
- r.prs[i].match = r.raftLog.lastIndex()
- }
- }
- r.pendingConf = false
- }
- func (r *raft) q() int {
- return len(r.prs)/2 + 1
- }
- func (r *raft) appendEntry(e pb.Entry) {
- e.Term = r.Term
- e.Index = r.raftLog.lastIndex() + 1
- r.raftLog.append(r.raftLog.lastIndex(), e)
- r.prs[r.id].update(r.raftLog.lastIndex())
- r.maybeCommit()
- }
- // tickElection is ran by followers and candidates after r.electionTimeout.
- func (r *raft) tickElection() {
- if !r.promotable() {
- r.elapsed = 0
- return
- }
- r.elapsed++
- if r.isElectionTimeout() {
- r.elapsed = 0
- r.Step(pb.Message{From: r.id, Type: msgHup})
- }
- }
- // tickHeartbeat is ran by leaders to send a msgBeat after r.heartbeatTimeout.
- func (r *raft) tickHeartbeat() {
- r.elapsed++
- if r.elapsed > r.heartbeatTimeout {
- r.elapsed = 0
- r.Step(pb.Message{From: r.id, Type: msgBeat})
- }
- }
- func (r *raft) becomeFollower(term int64, lead int64) {
- r.step = stepFollower
- r.reset(term)
- r.tick = r.tickElection
- r.lead = lead
- r.state = StateFollower
- }
- func (r *raft) becomeCandidate() {
- // TODO(xiangli) remove the panic when the raft implementation is stable
- if r.state == StateLeader {
- panic("invalid transition [leader -> candidate]")
- }
- r.step = stepCandidate
- r.reset(r.Term + 1)
- r.tick = r.tickElection
- r.Vote = r.id
- r.state = StateCandidate
- }
- func (r *raft) becomeLeader() {
- // TODO(xiangli) remove the panic when the raft implementation is stable
- if r.state == StateFollower {
- panic("invalid transition [follower -> leader]")
- }
- r.step = stepLeader
- r.reset(r.Term)
- r.tick = r.tickHeartbeat
- r.lead = r.id
- r.state = StateLeader
- for _, e := range r.raftLog.entries(r.raftLog.committed + 1) {
- if e.Type != pb.EntryConfChange {
- continue
- }
- if r.pendingConf {
- panic("unexpected double uncommitted config entry")
- }
- r.pendingConf = true
- }
- r.appendEntry(pb.Entry{Data: nil})
- }
- func (r *raft) ReadMessages() []pb.Message {
- msgs := r.msgs
- r.msgs = make([]pb.Message, 0)
- return msgs
- }
- func (r *raft) campaign() {
- r.becomeCandidate()
- if r.q() == r.poll(r.id, true) {
- r.becomeLeader()
- }
- for i := range r.prs {
- if i == r.id {
- continue
- }
- lasti := r.raftLog.lastIndex()
- r.send(pb.Message{To: i, Type: msgVote, Index: lasti, LogTerm: r.raftLog.term(lasti)})
- }
- }
- func (r *raft) Step(m pb.Message) error {
- // TODO(bmizerany): this likely allocs - prevent that.
- defer func() { r.Commit = r.raftLog.committed }()
- if r.removed[m.From] {
- if m.From != r.id {
- r.send(pb.Message{To: m.From, Type: msgDenied})
- }
- // TODO: return an error?
- return nil
- }
- if m.Type == msgDenied {
- r.removed[r.id] = true
- // TODO: return an error?
- return nil
- }
- if m.Type == msgHup {
- r.campaign()
- }
- switch {
- case m.Term == 0:
- // local message
- case m.Term > r.Term:
- lead := m.From
- if m.Type == msgVote {
- lead = None
- }
- r.becomeFollower(m.Term, lead)
- case m.Term < r.Term:
- // ignore
- return nil
- }
- r.step(r, m)
- return nil
- }
- func (r *raft) handleAppendEntries(m pb.Message) {
- if r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
- r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
- } else {
- r.send(pb.Message{To: m.From, Type: msgAppResp, Index: m.Index, Reject: true})
- }
- }
- func (r *raft) handleSnapshot(m pb.Message) {
- if r.restore(m.Snapshot) {
- r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
- } else {
- r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.committed})
- }
- }
- func (r *raft) addNode(id int64) {
- r.setProgress(id, 0, r.raftLog.lastIndex()+1)
- r.pendingConf = false
- }
- func (r *raft) removeNode(id int64) {
- r.delProgress(id)
- r.pendingConf = false
- r.removed[id] = true
- }
- type stepFunc func(r *raft, m pb.Message)
- func stepLeader(r *raft, m pb.Message) {
- switch m.Type {
- case msgBeat:
- r.bcastHeartbeat()
- case msgProp:
- if len(m.Entries) != 1 {
- panic("unexpected length(entries) of a msgProp")
- }
- e := m.Entries[0]
- if e.Type == pb.EntryConfChange {
- if r.pendingConf {
- return
- }
- r.pendingConf = true
- }
- r.appendEntry(e)
- r.bcastAppend()
- case msgAppResp:
- if m.Reject {
- if r.prs[m.From].maybeDecrTo(m.Index) {
- r.sendAppend(m.From)
- }
- } else {
- r.prs[m.From].update(m.Index)
- if r.maybeCommit() {
- r.bcastAppend()
- }
- }
- case msgVote:
- r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
- }
- }
- func stepCandidate(r *raft, m pb.Message) {
- switch m.Type {
- case msgProp:
- panic("no leader")
- case msgApp:
- r.becomeFollower(r.Term, m.From)
- r.handleAppendEntries(m)
- case msgSnap:
- r.becomeFollower(m.Term, m.From)
- r.handleSnapshot(m)
- case msgVote:
- r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
- case msgVoteResp:
- gr := r.poll(m.From, !m.Reject)
- switch r.q() {
- case gr:
- r.becomeLeader()
- r.bcastAppend()
- case len(r.votes) - gr:
- r.becomeFollower(r.Term, None)
- }
- }
- }
- func stepFollower(r *raft, m pb.Message) {
- switch m.Type {
- case msgProp:
- if r.lead == None {
- panic("no leader")
- }
- m.To = r.lead
- r.send(m)
- case msgApp:
- r.elapsed = 0
- r.lead = m.From
- r.handleAppendEntries(m)
- case msgSnap:
- r.elapsed = 0
- r.handleSnapshot(m)
- case msgVote:
- if (r.Vote == None || r.Vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
- r.elapsed = 0
- r.Vote = m.From
- r.send(pb.Message{To: m.From, Type: msgVoteResp})
- } else {
- r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
- }
- }
- }
- func (r *raft) compact(index int64, nodes []int64, d []byte) {
- if index > r.raftLog.applied {
- panic(fmt.Sprintf("raft: compact index (%d) exceeds applied index (%d)", index, r.raftLog.applied))
- }
- r.raftLog.snap(d, index, r.raftLog.term(index), nodes)
- r.raftLog.compact(index)
- }
- // restore recovers the statemachine from a snapshot. It restores the log and the
- // configuration of statemachine.
- func (r *raft) restore(s pb.Snapshot) bool {
- if s.Index <= r.raftLog.committed {
- return false
- }
- r.raftLog.restore(s)
- r.prs = make(map[int64]*progress)
- for _, n := range s.Nodes {
- if n == r.id {
- r.setProgress(n, r.raftLog.lastIndex(), r.raftLog.lastIndex()+1)
- } else {
- r.setProgress(n, 0, r.raftLog.lastIndex()+1)
- }
- }
- return true
- }
- func (r *raft) needSnapshot(i int64) bool {
- if i < r.raftLog.offset {
- if r.raftLog.snapshot.Term == 0 {
- panic("need non-empty snapshot")
- }
- return true
- }
- return false
- }
- func (r *raft) nodes() []int64 {
- nodes := make([]int64, 0, len(r.prs))
- for k := range r.prs {
- nodes = append(nodes, k)
- }
- return nodes
- }
- func (r *raft) setProgress(id, match, next int64) {
- r.prs[id] = &progress{next: next, match: match}
- }
- func (r *raft) delProgress(id int64) {
- delete(r.prs, id)
- }
- // promotable indicates whether state machine can be promoted to leader,
- // which is true when its own id is in progress list.
- func (r *raft) promotable() bool {
- _, ok := r.prs[r.id]
- return ok
- }
- func (r *raft) loadEnts(ents []pb.Entry) {
- r.raftLog.load(ents)
- }
- func (r *raft) loadState(state pb.HardState) {
- r.raftLog.committed = state.Commit
- r.Term = state.Term
- r.Vote = state.Vote
- r.Commit = state.Commit
- }
- // isElectionTimeout returns true if r.elapsed is greater than the
- // randomized election timeout in (electiontimeout, 2 * electiontimeout - 1).
- // Otherwise, it returns false.
- func (r *raft) isElectionTimeout() bool {
- d := r.elapsed - r.electionTimeout
- if d < 0 {
- return false
- }
- return d > rand.Int()%r.electionTimeout
- }
|