snapshot_succeed_via_app_resp.txt 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. # TestSnapshotSucceedViaAppResp regression tests the situation in which a snap-
  2. # shot is sent to a follower at the most recent index (i.e. the snapshot index
  3. # is the leader's last index is the committed index). In that situation, a bug
  4. # in the past left the follower in probing status until the next log entry was
  5. # committed.
  6. #
  7. # See https://github.com/etcd-io/etcd/pull/10308 for additional background.
  8. # Turn off output during the setup of the test.
  9. log-level none
  10. ----
  11. ok
  12. # Start with two nodes, but the config already has a third.
  13. add-nodes 2 voters=(1,2,3) index=10
  14. ----
  15. ok
  16. campaign 1
  17. ----
  18. ok
  19. # Fully replicate everything, including the leader's empty index.
  20. stabilize
  21. ----
  22. ok (quiet)
  23. compact 1 11
  24. ----
  25. ok (quiet)
  26. # Drop inflight messages to n3.
  27. deliver-msgs drop=(3)
  28. ----
  29. ok (quiet)
  30. # Show the Raft log messages from now on.
  31. log-level debug
  32. ----
  33. ok
  34. status 1
  35. ----
  36. 1: StateReplicate match=11 next=12 inactive
  37. 2: StateReplicate match=11 next=12
  38. 3: StateProbe match=0 next=11 paused inactive
  39. # Add the node that will receive a snapshot (it has no state at all, does not
  40. # even have a config).
  41. add-nodes 1
  42. ----
  43. INFO 3 switched to configuration voters=()
  44. INFO 3 became follower at term 0
  45. INFO newRaft 3 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0]
  46. # Time passes on the leader so that it will try the previously missing follower
  47. # again.
  48. tick-heartbeat 1
  49. ----
  50. ok
  51. process-ready 1
  52. ----
  53. Ready MustSync=false:
  54. Messages:
  55. 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11
  56. 1->3 MsgHeartbeat Term:1 Log:0/0
  57. # Iterate until no more work is done by the new peer. It receives the heartbeat
  58. # and responds.
  59. stabilize 3
  60. ----
  61. > 3 receiving messages
  62. 1->3 MsgHeartbeat Term:1 Log:0/0
  63. INFO 3 [term: 0] received a MsgHeartbeat message with higher term from 1 [term: 1]
  64. INFO 3 became follower at term 1
  65. > 3 handling Ready
  66. Ready MustSync=true:
  67. Lead:1 State:StateFollower
  68. HardState Term:1 Commit:0
  69. Messages:
  70. 3->1 MsgHeartbeatResp Term:1 Log:0/0
  71. # The leader in turn will realize that n3 needs a snapshot, which it initiates.
  72. stabilize 1
  73. ----
  74. > 1 receiving messages
  75. 3->1 MsgHeartbeatResp Term:1 Log:0/0
  76. DEBUG 1 [firstindex: 12, commit: 11] sent snapshot[index: 11, term: 1] to 3 [StateProbe match=0 next=11]
  77. DEBUG 1 paused sending replication messages to 3 [StateSnapshot match=0 next=11 paused pendingSnap=11]
  78. > 1 handling Ready
  79. Ready MustSync=false:
  80. Messages:
  81. 1->3 MsgSnap Term:1 Log:0/0 Snapshot: Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false
  82. status 1
  83. ----
  84. 1: StateReplicate match=11 next=12 inactive
  85. 2: StateReplicate match=11 next=12
  86. 3: StateSnapshot match=0 next=11 paused pendingSnap=11
  87. # Follower applies the snapshot. Note how it reacts with a MsgAppResp upon completion.
  88. # The snapshot fully catches the follower up (i.e. there are no more log entries it
  89. # needs to apply after). The bug was that the leader failed to realize that the follower
  90. # was now fully caught up.
  91. stabilize 3
  92. ----
  93. > 3 receiving messages
  94. 1->3 MsgSnap Term:1 Log:0/0 Snapshot: Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false
  95. INFO log [committed=0, applied=0, unstable.offset=1, len(unstable.Entries)=0] starts to restore snapshot [index: 11, term: 1]
  96. INFO 3 switched to configuration voters=(1 2 3)
  97. INFO 3 [commit: 11, lastindex: 11, lastterm: 1] restored snapshot [index: 11, term: 1]
  98. INFO 3 [commit: 11] restored snapshot [index: 11, term: 1]
  99. > 3 handling Ready
  100. Ready MustSync=false:
  101. HardState Term:1 Commit:11
  102. Snapshot Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false
  103. Messages:
  104. 3->1 MsgAppResp Term:1 Log:0/11
  105. # The MsgAppResp lets the leader move the follower back to replicating state.
  106. # Leader sends another MsgAppResp, to communicate the updated commit index.
  107. stabilize 1
  108. ----
  109. > 1 receiving messages
  110. 3->1 MsgAppResp Term:1 Log:0/11
  111. DEBUG 1 recovered from needing snapshot, resumed sending replication messages to 3 [StateSnapshot match=11 next=12 paused pendingSnap=11]
  112. > 1 handling Ready
  113. Ready MustSync=false:
  114. Messages:
  115. 1->3 MsgApp Term:1 Log:1/11 Commit:11
  116. status 1
  117. ----
  118. 1: StateReplicate match=11 next=12 inactive
  119. 2: StateReplicate match=11 next=12
  120. 3: StateReplicate match=11 next=12
  121. # Let things settle.
  122. stabilize
  123. ----
  124. > 2 receiving messages
  125. 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11
  126. > 3 receiving messages
  127. 1->3 MsgApp Term:1 Log:1/11 Commit:11
  128. > 2 handling Ready
  129. Ready MustSync=false:
  130. Messages:
  131. 2->1 MsgHeartbeatResp Term:1 Log:0/0
  132. > 3 handling Ready
  133. Ready MustSync=false:
  134. Messages:
  135. 3->1 MsgAppResp Term:1 Log:0/11
  136. > 1 receiving messages
  137. 2->1 MsgHeartbeatResp Term:1 Log:0/0
  138. 3->1 MsgAppResp Term:1 Log:0/11