|
|
@@ -148,12 +148,17 @@ type Config struct {
|
|
|
// applied entries. This is a very application dependent configuration.
|
|
|
Applied uint64
|
|
|
|
|
|
- // MaxSizePerMsg limits the max size of each append message. Smaller value
|
|
|
- // lowers the raft recovery cost(initial probing and message lost during normal
|
|
|
- // operation). On the other side, it might affect the throughput during normal
|
|
|
- // replication. Note: math.MaxUint64 for unlimited, 0 for at most one entry per
|
|
|
- // message.
|
|
|
+ // MaxSizePerMsg limits the max byte size of each append message. Smaller
|
|
|
+ // value lowers the raft recovery cost (initial probing and messages lost
|
|
|
+ // during normal operation). On the other side, it might affect the
|
|
|
+ // throughput during normal replication. Note: math.MaxUint64 for unlimited,
|
|
|
+ // 0 for at most one entry per message.
|
|
|
MaxSizePerMsg uint64
|
|
|
+ // MaxUncommittedEntriesSize limits the aggregate byte size of the
|
|
|
+ // uncommitted entries that may be appended to a leader's log. Once this
|
|
|
+ // limit is exceeded, proposals will begin to return ErrProposalDropped
|
|
|
+ // errors. Note: 0 for no limit.
|
|
|
+ MaxUncommittedEntriesSize uint64
|
|
|
// MaxInflightMsgs limits the max number of in-flight append messages during
|
|
|
// optimistic replication phase. The application transportation layer usually
|
|
|
// has its own sending buffer over TCP/UDP. Setting MaxInflightMsgs to avoid
|
|
|
@@ -215,6 +220,10 @@ func (c *Config) validate() error {
|
|
|
return errors.New("storage cannot be nil")
|
|
|
}
|
|
|
|
|
|
+ if c.MaxUncommittedEntriesSize == 0 {
|
|
|
+ c.MaxUncommittedEntriesSize = noLimit
|
|
|
+ }
|
|
|
+
|
|
|
if c.MaxInflightMsgs <= 0 {
|
|
|
return errors.New("max inflight messages must be greater than 0")
|
|
|
}
|
|
|
@@ -241,11 +250,12 @@ type raft struct {
|
|
|
// the log
|
|
|
raftLog *raftLog
|
|
|
|
|
|
- maxInflight int
|
|
|
- maxMsgSize uint64
|
|
|
- prs map[uint64]*Progress
|
|
|
- learnerPrs map[uint64]*Progress
|
|
|
- matchBuf uint64Slice
|
|
|
+ maxMsgSize uint64
|
|
|
+ maxUncommittedSize uint64
|
|
|
+ maxInflight int
|
|
|
+ prs map[uint64]*Progress
|
|
|
+ learnerPrs map[uint64]*Progress
|
|
|
+ matchBuf uint64Slice
|
|
|
|
|
|
state StateType
|
|
|
|
|
|
@@ -268,6 +278,10 @@ type raft struct {
|
|
|
// be proposed if the leader's applied index is greater than this
|
|
|
// value.
|
|
|
pendingConfIndex uint64
|
|
|
+ // an estimate of the size of the uncommitted tail of the Raft log. Used to
|
|
|
+ // prevent unbounded log growth. Only maintained by the leader. Reset on
|
|
|
+ // term changes.
|
|
|
+ uncommittedSize uint64
|
|
|
|
|
|
readOnly *readOnly
|
|
|
|
|
|
@@ -326,6 +340,7 @@ func newRaft(c *Config) *raft {
|
|
|
raftLog: raftlog,
|
|
|
maxMsgSize: c.MaxSizePerMsg,
|
|
|
maxInflight: c.MaxInflightMsgs,
|
|
|
+ maxUncommittedSize: c.MaxUncommittedEntriesSize,
|
|
|
prs: make(map[uint64]*Progress),
|
|
|
learnerPrs: make(map[uint64]*Progress),
|
|
|
electionTimeout: c.ElectionTick,
|
|
|
@@ -514,7 +529,7 @@ func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
|
|
|
return true
|
|
|
}
|
|
|
|
|
|
-// sendHeartbeat sends an empty MsgApp
|
|
|
+// sendHeartbeat sends a heartbeat RPC to the given peer.
|
|
|
func (r *raft) sendHeartbeat(to uint64, ctx []byte) {
|
|
|
// Attach the commit as min(to.matched, r.committed).
|
|
|
// When the leader sends out heartbeat message,
|
|
|
@@ -616,6 +631,7 @@ func (r *raft) reset(term uint64) {
|
|
|
})
|
|
|
|
|
|
r.pendingConfIndex = 0
|
|
|
+ r.uncommittedSize = 0
|
|
|
r.readOnly = newReadOnly(r.readOnly.option)
|
|
|
}
|
|
|
|
|
|
@@ -954,6 +970,10 @@ func stepLeader(r *raft, m pb.Message) error {
|
|
|
r.logger.Debugf("%x [term %d] transfer leadership to %x is in progress; dropping proposal", r.id, r.Term, r.leadTransferee)
|
|
|
return ErrProposalDropped
|
|
|
}
|
|
|
+ if !r.increaseUncommittedSize(m.Entries) {
|
|
|
+ r.logger.Debugf("%x appending new entries to log would exceed uncommitted entry size limit; dropping proposal", r.id)
|
|
|
+ return ErrProposalDropped
|
|
|
+ }
|
|
|
|
|
|
for i, e := range m.Entries {
|
|
|
if e.Type == pb.EntryConfChange {
|
|
|
@@ -1462,6 +1482,49 @@ func (r *raft) abortLeaderTransfer() {
|
|
|
r.leadTransferee = None
|
|
|
}
|
|
|
|
|
|
+// increaseUncommittedSize computes the size of the proposed entries and
|
|
|
+// determines whether they would push leader over its maxUncommittedSize limit.
|
|
|
+// If the new entries would exceed the limit, the method returns false. If not,
|
|
|
+// the increase in uncommitted entry size is recorded and the method returns
|
|
|
+// true.
|
|
|
+func (r *raft) increaseUncommittedSize(ents []pb.Entry) bool {
|
|
|
+ var s uint64
|
|
|
+ for _, e := range ents {
|
|
|
+ s += uint64(e.Size())
|
|
|
+ }
|
|
|
+
|
|
|
+ if r.uncommittedSize > 0 && r.uncommittedSize+s > r.maxUncommittedSize {
|
|
|
+ // If the uncommitted tail of the Raft log is empty, allow any size
|
|
|
+ // proposal. Otherwise, limit the size of the uncommitted tail of the
|
|
|
+ // log and drop any proposal that would push the size over the limit.
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ r.uncommittedSize += s
|
|
|
+ return true
|
|
|
+}
|
|
|
+
|
|
|
+// reduceUncommittedSize accounts for the newly committed entries by decreasing
|
|
|
+// the uncommitted entry size limit.
|
|
|
+func (r *raft) reduceUncommittedSize(ents []pb.Entry) {
|
|
|
+ if r.uncommittedSize == 0 {
|
|
|
+ // Fast-path for followers, who do not track or enforce the limit.
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ var s uint64
|
|
|
+ for _, e := range ents {
|
|
|
+ s += uint64(e.Size())
|
|
|
+ }
|
|
|
+ if s > r.uncommittedSize {
|
|
|
+ // uncommittedSize may underestimate the size of the uncommitted Raft
|
|
|
+ // log tail but will never overestimate it. Saturate at 0 instead of
|
|
|
+ // allowing overflow.
|
|
|
+ r.uncommittedSize = 0
|
|
|
+ } else {
|
|
|
+ r.uncommittedSize -= s
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
func numOfPendingConf(ents []pb.Entry) int {
|
|
|
n := 0
|
|
|
for i := range ents {
|