| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460 |
- // Copyright 2015 The etcd Authors
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package raft
- import (
- "fmt"
- "sort"
- )
- const (
- ProgressStateProbe ProgressStateType = iota
- ProgressStateReplicate
- ProgressStateSnapshot
- )
- type ProgressStateType uint64
- var prstmap = [...]string{
- "ProgressStateProbe",
- "ProgressStateReplicate",
- "ProgressStateSnapshot",
- }
- func (st ProgressStateType) String() string { return prstmap[uint64(st)] }
- // Progress represents a follower’s progress in the view of the leader. Leader maintains
- // progresses of all followers, and sends entries to the follower based on its progress.
- type Progress struct {
- Match, Next uint64
- // State defines how the leader should interact with the follower.
- //
- // When in ProgressStateProbe, leader sends at most one replication message
- // per heartbeat interval. It also probes actual progress of the follower.
- //
- // When in ProgressStateReplicate, leader optimistically increases next
- // to the latest entry sent after sending replication message. This is
- // an optimized state for fast replicating log entries to the follower.
- //
- // When in ProgressStateSnapshot, leader should have sent out snapshot
- // before and stops sending any replication message.
- State ProgressStateType
- // Paused is used in ProgressStateProbe.
- // When Paused is true, raft should pause sending replication message to this peer.
- Paused bool
- // PendingSnapshot is used in ProgressStateSnapshot.
- // If there is a pending snapshot, the pendingSnapshot will be set to the
- // index of the snapshot. If pendingSnapshot is set, the replication process of
- // this Progress will be paused. raft will not resend snapshot until the pending one
- // is reported to be failed.
- PendingSnapshot uint64
- // RecentActive is true if the progress is recently active. Receiving any messages
- // from the corresponding follower indicates the progress is active.
- // RecentActive can be reset to false after an election timeout.
- RecentActive bool
- // inflights is a sliding window for the inflight messages.
- // Each inflight message contains one or more log entries.
- // The max number of entries per message is defined in raft config as MaxSizePerMsg.
- // Thus inflight effectively limits both the number of inflight messages
- // and the bandwidth each Progress can use.
- // When inflights is full, no more message should be sent.
- // When a leader sends out a message, the index of the last
- // entry should be added to inflights. The index MUST be added
- // into inflights in order.
- // When a leader receives a reply, the previous inflights should
- // be freed by calling inflights.freeTo with the index of the last
- // received entry.
- ins *inflights
- // IsLearner is true if this progress is tracked for a learner.
- IsLearner bool
- }
- func (pr *Progress) resetState(state ProgressStateType) {
- pr.Paused = false
- pr.PendingSnapshot = 0
- pr.State = state
- pr.ins.reset()
- }
- func (pr *Progress) becomeProbe() {
- // If the original state is ProgressStateSnapshot, progress knows that
- // the pending snapshot has been sent to this peer successfully, then
- // probes from pendingSnapshot + 1.
- if pr.State == ProgressStateSnapshot {
- pendingSnapshot := pr.PendingSnapshot
- pr.resetState(ProgressStateProbe)
- pr.Next = max(pr.Match+1, pendingSnapshot+1)
- } else {
- pr.resetState(ProgressStateProbe)
- pr.Next = pr.Match + 1
- }
- }
- func (pr *Progress) becomeReplicate() {
- pr.resetState(ProgressStateReplicate)
- pr.Next = pr.Match + 1
- }
- func (pr *Progress) becomeSnapshot(snapshoti uint64) {
- pr.resetState(ProgressStateSnapshot)
- pr.PendingSnapshot = snapshoti
- }
- // maybeUpdate returns false if the given n index comes from an outdated message.
- // Otherwise it updates the progress and returns true.
- func (pr *Progress) maybeUpdate(n uint64) bool {
- var updated bool
- if pr.Match < n {
- pr.Match = n
- updated = true
- pr.resume()
- }
- if pr.Next < n+1 {
- pr.Next = n + 1
- }
- return updated
- }
- func (pr *Progress) optimisticUpdate(n uint64) { pr.Next = n + 1 }
- // maybeDecrTo returns false if the given to index comes from an out of order message.
- // Otherwise it decreases the progress next index to min(rejected, last) and returns true.
- func (pr *Progress) maybeDecrTo(rejected, last uint64) bool {
- if pr.State == ProgressStateReplicate {
- // the rejection must be stale if the progress has matched and "rejected"
- // is smaller than "match".
- if rejected <= pr.Match {
- return false
- }
- // directly decrease next to match + 1
- pr.Next = pr.Match + 1
- return true
- }
- // the rejection must be stale if "rejected" does not match next - 1
- if pr.Next-1 != rejected {
- return false
- }
- if pr.Next = min(rejected, last+1); pr.Next < 1 {
- pr.Next = 1
- }
- pr.resume()
- return true
- }
- func (pr *Progress) pause() { pr.Paused = true }
- func (pr *Progress) resume() { pr.Paused = false }
- // IsPaused returns whether sending log entries to this node has been
- // paused. A node may be paused because it has rejected recent
- // MsgApps, is currently waiting for a snapshot, or has reached the
- // MaxInflightMsgs limit.
- func (pr *Progress) IsPaused() bool {
- switch pr.State {
- case ProgressStateProbe:
- return pr.Paused
- case ProgressStateReplicate:
- return pr.ins.full()
- case ProgressStateSnapshot:
- return true
- default:
- panic("unexpected state")
- }
- }
- func (pr *Progress) snapshotFailure() { pr.PendingSnapshot = 0 }
- // needSnapshotAbort returns true if snapshot progress's Match
- // is equal or higher than the pendingSnapshot.
- func (pr *Progress) needSnapshotAbort() bool {
- return pr.State == ProgressStateSnapshot && pr.Match >= pr.PendingSnapshot
- }
- func (pr *Progress) String() string {
- return fmt.Sprintf("next = %d, match = %d, state = %s, waiting = %v, pendingSnapshot = %d, recentActive = %v, isLearner = %v",
- pr.Next, pr.Match, pr.State, pr.IsPaused(), pr.PendingSnapshot, pr.RecentActive, pr.IsLearner)
- }
- type inflights struct {
- // the starting index in the buffer
- start int
- // number of inflights in the buffer
- count int
- // the size of the buffer
- size int
- // buffer contains the index of the last entry
- // inside one message.
- buffer []uint64
- }
- func newInflights(size int) *inflights {
- return &inflights{
- size: size,
- }
- }
- // add adds an inflight into inflights
- func (in *inflights) add(inflight uint64) {
- if in.full() {
- panic("cannot add into a full inflights")
- }
- next := in.start + in.count
- size := in.size
- if next >= size {
- next -= size
- }
- if next >= len(in.buffer) {
- in.growBuf()
- }
- in.buffer[next] = inflight
- in.count++
- }
- // grow the inflight buffer by doubling up to inflights.size. We grow on demand
- // instead of preallocating to inflights.size to handle systems which have
- // thousands of Raft groups per process.
- func (in *inflights) growBuf() {
- newSize := len(in.buffer) * 2
- if newSize == 0 {
- newSize = 1
- } else if newSize > in.size {
- newSize = in.size
- }
- newBuffer := make([]uint64, newSize)
- copy(newBuffer, in.buffer)
- in.buffer = newBuffer
- }
- // freeTo frees the inflights smaller or equal to the given `to` flight.
- func (in *inflights) freeTo(to uint64) {
- if in.count == 0 || to < in.buffer[in.start] {
- // out of the left side of the window
- return
- }
- idx := in.start
- var i int
- for i = 0; i < in.count; i++ {
- if to < in.buffer[idx] { // found the first large inflight
- break
- }
- // increase index and maybe rotate
- size := in.size
- if idx++; idx >= size {
- idx -= size
- }
- }
- // free i inflights and set new start index
- in.count -= i
- in.start = idx
- if in.count == 0 {
- // inflights is empty, reset the start index so that we don't grow the
- // buffer unnecessarily.
- in.start = 0
- }
- }
- func (in *inflights) freeFirstOne() { in.freeTo(in.buffer[in.start]) }
- // full returns true if the inflights is full.
- func (in *inflights) full() bool {
- return in.count == in.size
- }
- // resets frees all inflights.
- func (in *inflights) reset() {
- in.count = 0
- in.start = 0
- }
- // progressTracker tracks the currently active configuration and the information
- // known about the nodes and learners in it. In particular, it tracks the match
- // index for each peer which in turn allows reasoning about the committed index.
- type progressTracker struct {
- nodes map[uint64]*Progress
- learners map[uint64]*Progress
- votes map[uint64]bool
- maxInflight int
- matchBuf uint64Slice
- }
- func makePRS(maxInflight int) progressTracker {
- p := progressTracker{
- maxInflight: maxInflight,
- nodes: map[uint64]*Progress{},
- learners: map[uint64]*Progress{},
- votes: map[uint64]bool{},
- }
- return p
- }
- // isSingleton returns true if (and only if) there is only one voting member
- // (i.e. the leader) in the current configuration.
- func (p *progressTracker) isSingleton() bool {
- return len(p.nodes) == 1
- }
- func (p *progressTracker) quorum() int {
- return len(p.nodes)/2 + 1
- }
- func (p *progressTracker) hasQuorum(m map[uint64]struct{}) bool {
- return len(m) >= p.quorum()
- }
- // committed returns the largest log index known to be committed based on what
- // the voting members of the group have acknowledged.
- func (p *progressTracker) committed() uint64 {
- // Preserving matchBuf across calls is an optimization
- // used to avoid allocating a new slice on each call.
- if cap(p.matchBuf) < len(p.nodes) {
- p.matchBuf = make(uint64Slice, len(p.nodes))
- }
- p.matchBuf = p.matchBuf[:len(p.nodes)]
- idx := 0
- for _, pr := range p.nodes {
- p.matchBuf[idx] = pr.Match
- idx++
- }
- sort.Sort(&p.matchBuf)
- return p.matchBuf[len(p.matchBuf)-p.quorum()]
- }
- func (p *progressTracker) removeAny(id uint64) {
- pN := p.nodes[id]
- pL := p.learners[id]
- if pN == nil && pL == nil {
- panic("attempting to remove unknown peer %x")
- } else if pN != nil && pL != nil {
- panic(fmt.Sprintf("peer %x is both voter and learner", id))
- }
- delete(p.nodes, id)
- delete(p.learners, id)
- }
- // initProgress initializes a new progress for the given node or learner. The
- // node may not exist yet in either form or a panic will ensue.
- func (p *progressTracker) initProgress(id, match, next uint64, isLearner bool) {
- if pr := p.nodes[id]; pr != nil {
- panic(fmt.Sprintf("peer %x already tracked as node %v", id, pr))
- }
- if pr := p.learners[id]; pr != nil {
- panic(fmt.Sprintf("peer %x already tracked as learner %v", id, pr))
- }
- if !isLearner {
- p.nodes[id] = &Progress{Next: next, Match: match, ins: newInflights(p.maxInflight)}
- return
- }
- p.learners[id] = &Progress{Next: next, Match: match, ins: newInflights(p.maxInflight), IsLearner: true}
- }
- func (p *progressTracker) getProgress(id uint64) *Progress {
- if pr, ok := p.nodes[id]; ok {
- return pr
- }
- return p.learners[id]
- }
- // visit invokes the supplied closure for all tracked progresses.
- func (p *progressTracker) visit(f func(id uint64, pr *Progress)) {
- for id, pr := range p.nodes {
- f(id, pr)
- }
- for id, pr := range p.learners {
- f(id, pr)
- }
- }
- // checkQuorumActive returns true if the quorum is active from
- // the view of the local raft state machine. Otherwise, it returns
- // false.
- func (p *progressTracker) quorumActive() bool {
- var act int
- p.visit(func(id uint64, pr *Progress) {
- if pr.RecentActive && !pr.IsLearner {
- act++
- }
- })
- return act >= p.quorum()
- }
- func (p *progressTracker) voterNodes() []uint64 {
- nodes := make([]uint64, 0, len(p.nodes))
- for id := range p.nodes {
- nodes = append(nodes, id)
- }
- sort.Sort(uint64Slice(nodes))
- return nodes
- }
- func (p *progressTracker) learnerNodes() []uint64 {
- nodes := make([]uint64, 0, len(p.learners))
- for id := range p.learners {
- nodes = append(nodes, id)
- }
- sort.Sort(uint64Slice(nodes))
- return nodes
- }
- // resetVotes prepares for a new round of vote counting via recordVote.
- func (p *progressTracker) resetVotes() {
- p.votes = map[uint64]bool{}
- }
- // recordVote records that the node with the given id voted for this Raft
- // instance if v == true (and declined it otherwise).
- func (p *progressTracker) recordVote(id uint64, v bool) {
- _, ok := p.votes[id]
- if !ok {
- p.votes[id] = v
- }
- }
- // tallyVotes returns the number of granted and rejected votes, and whether the
- // election outcome is known.
- func (p *progressTracker) tallyVotes() (granted int, rejected int, result electionResult) {
- for _, v := range p.votes {
- if v {
- granted++
- } else {
- rejected++
- }
- }
- q := p.quorum()
- result = electionIndeterminate
- if granted >= q {
- result = electionWon
- } else if rejected >= q {
- result = electionLost
- }
- return granted, rejected, result
- }
|