http.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package rafthttp
  15. import (
  16. "context"
  17. "errors"
  18. "fmt"
  19. "io/ioutil"
  20. "net/http"
  21. "path"
  22. "strings"
  23. "github.com/coreos/etcd/etcdserver/api/snap"
  24. pioutil "github.com/coreos/etcd/pkg/ioutil"
  25. "github.com/coreos/etcd/pkg/types"
  26. "github.com/coreos/etcd/raft/raftpb"
  27. "github.com/coreos/etcd/version"
  28. humanize "github.com/dustin/go-humanize"
  29. "go.uber.org/zap"
  30. )
  31. const (
  32. // connReadLimitByte limits the number of bytes
  33. // a single read can read out.
  34. //
  35. // 64KB should be large enough for not causing
  36. // throughput bottleneck as well as small enough
  37. // for not causing a read timeout.
  38. connReadLimitByte = 64 * 1024
  39. )
  40. var (
  41. RaftPrefix = "/raft"
  42. ProbingPrefix = path.Join(RaftPrefix, "probing")
  43. RaftStreamPrefix = path.Join(RaftPrefix, "stream")
  44. RaftSnapshotPrefix = path.Join(RaftPrefix, "snapshot")
  45. errIncompatibleVersion = errors.New("incompatible version")
  46. errClusterIDMismatch = errors.New("cluster ID mismatch")
  47. )
  48. type peerGetter interface {
  49. Get(id types.ID) Peer
  50. }
  51. type writerToResponse interface {
  52. WriteTo(w http.ResponseWriter)
  53. }
  54. type pipelineHandler struct {
  55. lg *zap.Logger
  56. localID types.ID
  57. tr Transporter
  58. r Raft
  59. cid types.ID
  60. }
  61. // newPipelineHandler returns a handler for handling raft messages
  62. // from pipeline for RaftPrefix.
  63. //
  64. // The handler reads out the raft message from request body,
  65. // and forwards it to the given raft state machine for processing.
  66. func newPipelineHandler(t *Transport, r Raft, cid types.ID) http.Handler {
  67. return &pipelineHandler{
  68. lg: t.Logger,
  69. localID: t.ID,
  70. tr: t,
  71. r: r,
  72. cid: cid,
  73. }
  74. }
  75. func (h *pipelineHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  76. if r.Method != "POST" {
  77. w.Header().Set("Allow", "POST")
  78. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  79. return
  80. }
  81. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  82. if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
  83. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  84. return
  85. }
  86. addRemoteFromRequest(h.tr, r)
  87. // Limit the data size that could be read from the request body, which ensures that read from
  88. // connection will not time out accidentally due to possible blocking in underlying implementation.
  89. limitedr := pioutil.NewLimitedBufferReader(r.Body, connReadLimitByte)
  90. b, err := ioutil.ReadAll(limitedr)
  91. if err != nil {
  92. if h.lg != nil {
  93. h.lg.Warn(
  94. "failed to read Raft message",
  95. zap.String("local-member-id", h.localID.String()),
  96. zap.Error(err),
  97. )
  98. } else {
  99. plog.Errorf("failed to read raft message (%v)", err)
  100. }
  101. http.Error(w, "error reading raft message", http.StatusBadRequest)
  102. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  103. return
  104. }
  105. var m raftpb.Message
  106. if err := m.Unmarshal(b); err != nil {
  107. if h.lg != nil {
  108. h.lg.Warn(
  109. "failed to unmarshal Raft message",
  110. zap.String("local-member-id", h.localID.String()),
  111. zap.Error(err),
  112. )
  113. } else {
  114. plog.Errorf("failed to unmarshal raft message (%v)", err)
  115. }
  116. http.Error(w, "error unmarshaling raft message", http.StatusBadRequest)
  117. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  118. return
  119. }
  120. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(len(b)))
  121. if err := h.r.Process(context.TODO(), m); err != nil {
  122. switch v := err.(type) {
  123. case writerToResponse:
  124. v.WriteTo(w)
  125. default:
  126. if h.lg != nil {
  127. h.lg.Warn(
  128. "failed to process Raft message",
  129. zap.String("local-member-id", h.localID.String()),
  130. zap.Error(err),
  131. )
  132. } else {
  133. plog.Warningf("failed to process raft message (%v)", err)
  134. }
  135. http.Error(w, "error processing raft message", http.StatusInternalServerError)
  136. w.(http.Flusher).Flush()
  137. // disconnect the http stream
  138. panic(err)
  139. }
  140. return
  141. }
  142. // Write StatusNoContent header after the message has been processed by
  143. // raft, which facilitates the client to report MsgSnap status.
  144. w.WriteHeader(http.StatusNoContent)
  145. }
  146. type snapshotHandler struct {
  147. lg *zap.Logger
  148. tr Transporter
  149. r Raft
  150. snapshotter *snap.Snapshotter
  151. localID types.ID
  152. cid types.ID
  153. }
  154. func newSnapshotHandler(t *Transport, r Raft, snapshotter *snap.Snapshotter, cid types.ID) http.Handler {
  155. return &snapshotHandler{
  156. lg: t.Logger,
  157. tr: t,
  158. r: r,
  159. snapshotter: snapshotter,
  160. localID: t.ID,
  161. cid: cid,
  162. }
  163. }
  164. // ServeHTTP serves HTTP request to receive and process snapshot message.
  165. //
  166. // If request sender dies without closing underlying TCP connection,
  167. // the handler will keep waiting for the request body until TCP keepalive
  168. // finds out that the connection is broken after several minutes.
  169. // This is acceptable because
  170. // 1. snapshot messages sent through other TCP connections could still be
  171. // received and processed.
  172. // 2. this case should happen rarely, so no further optimization is done.
  173. func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  174. if r.Method != "POST" {
  175. w.Header().Set("Allow", "POST")
  176. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  177. return
  178. }
  179. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  180. if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
  181. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  182. return
  183. }
  184. addRemoteFromRequest(h.tr, r)
  185. dec := &messageDecoder{r: r.Body}
  186. // let snapshots be very large since they can exceed 512MB for large installations
  187. m, err := dec.decodeLimit(uint64(1 << 63))
  188. if err != nil {
  189. msg := fmt.Sprintf("failed to decode raft message (%v)", err)
  190. if h.lg != nil {
  191. h.lg.Warn(
  192. "failed to decode Raft message",
  193. zap.String("local-member-id", h.localID.String()),
  194. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  195. zap.Error(err),
  196. )
  197. } else {
  198. plog.Error(msg)
  199. }
  200. http.Error(w, msg, http.StatusBadRequest)
  201. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  202. return
  203. }
  204. msgSize := m.Size()
  205. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(msgSize))
  206. if m.Type != raftpb.MsgSnap {
  207. if h.lg != nil {
  208. h.lg.Warn(
  209. "unexpected Raft message type",
  210. zap.String("local-member-id", h.localID.String()),
  211. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  212. zap.String("message-type", m.Type.String()),
  213. )
  214. } else {
  215. plog.Errorf("unexpected raft message type %s on snapshot path", m.Type)
  216. }
  217. http.Error(w, "wrong raft message type", http.StatusBadRequest)
  218. return
  219. }
  220. if h.lg != nil {
  221. h.lg.Info(
  222. "receiving database snapshot",
  223. zap.String("local-member-id", h.localID.String()),
  224. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  225. zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
  226. zap.Int("incoming-snapshot-message-size-bytes", msgSize),
  227. zap.String("incoming-snapshot-message-size", humanize.Bytes(uint64(msgSize))),
  228. )
  229. } else {
  230. plog.Infof("receiving database snapshot [index:%d, from %s] ...", m.Snapshot.Metadata.Index, types.ID(m.From))
  231. }
  232. // save incoming database snapshot.
  233. n, err := h.snapshotter.SaveDBFrom(r.Body, m.Snapshot.Metadata.Index)
  234. if err != nil {
  235. msg := fmt.Sprintf("failed to save KV snapshot (%v)", err)
  236. if h.lg != nil {
  237. h.lg.Warn(
  238. "failed to save incoming database snapshot",
  239. zap.String("local-member-id", h.localID.String()),
  240. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  241. zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
  242. zap.Error(err),
  243. )
  244. } else {
  245. plog.Error(msg)
  246. }
  247. http.Error(w, msg, http.StatusInternalServerError)
  248. return
  249. }
  250. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(n))
  251. if h.lg != nil {
  252. h.lg.Info(
  253. "received and saved database snapshot",
  254. zap.String("local-member-id", h.localID.String()),
  255. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  256. zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
  257. zap.Int64("incoming-snapshot-size-bytes", n),
  258. zap.String("incoming-snapshot-size", humanize.Bytes(uint64(n))),
  259. )
  260. } else {
  261. plog.Infof("received and saved database snapshot [index: %d, from: %s] successfully", m.Snapshot.Metadata.Index, types.ID(m.From))
  262. }
  263. if err := h.r.Process(context.TODO(), m); err != nil {
  264. switch v := err.(type) {
  265. // Process may return writerToResponse error when doing some
  266. // additional checks before calling raft.Node.Step.
  267. case writerToResponse:
  268. v.WriteTo(w)
  269. default:
  270. msg := fmt.Sprintf("failed to process raft message (%v)", err)
  271. if h.lg != nil {
  272. h.lg.Warn(
  273. "failed to process Raft message",
  274. zap.String("local-member-id", h.localID.String()),
  275. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  276. zap.Error(err),
  277. )
  278. } else {
  279. plog.Error(msg)
  280. }
  281. http.Error(w, msg, http.StatusInternalServerError)
  282. }
  283. return
  284. }
  285. // Write StatusNoContent header after the message has been processed by
  286. // raft, which facilitates the client to report MsgSnap status.
  287. w.WriteHeader(http.StatusNoContent)
  288. }
  289. type streamHandler struct {
  290. lg *zap.Logger
  291. tr *Transport
  292. peerGetter peerGetter
  293. r Raft
  294. id types.ID
  295. cid types.ID
  296. }
  297. func newStreamHandler(t *Transport, pg peerGetter, r Raft, id, cid types.ID) http.Handler {
  298. return &streamHandler{
  299. lg: t.Logger,
  300. tr: t,
  301. peerGetter: pg,
  302. r: r,
  303. id: id,
  304. cid: cid,
  305. }
  306. }
  307. func (h *streamHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  308. if r.Method != "GET" {
  309. w.Header().Set("Allow", "GET")
  310. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  311. return
  312. }
  313. w.Header().Set("X-Server-Version", version.Version)
  314. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  315. if err := checkClusterCompatibilityFromHeader(h.lg, h.tr.ID, r.Header, h.cid); err != nil {
  316. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  317. return
  318. }
  319. var t streamType
  320. switch path.Dir(r.URL.Path) {
  321. case streamTypeMsgAppV2.endpoint():
  322. t = streamTypeMsgAppV2
  323. case streamTypeMessage.endpoint():
  324. t = streamTypeMessage
  325. default:
  326. if h.lg != nil {
  327. h.lg.Debug(
  328. "ignored unexpected streaming request path",
  329. zap.String("local-member-id", h.tr.ID.String()),
  330. zap.String("remote-peer-id-stream-handler", h.id.String()),
  331. zap.String("path", r.URL.Path),
  332. )
  333. } else {
  334. plog.Debugf("ignored unexpected streaming request path %s", r.URL.Path)
  335. }
  336. http.Error(w, "invalid path", http.StatusNotFound)
  337. return
  338. }
  339. fromStr := path.Base(r.URL.Path)
  340. from, err := types.IDFromString(fromStr)
  341. if err != nil {
  342. if h.lg != nil {
  343. h.lg.Warn(
  344. "failed to parse path into ID",
  345. zap.String("local-member-id", h.tr.ID.String()),
  346. zap.String("remote-peer-id-stream-handler", h.id.String()),
  347. zap.String("path", fromStr),
  348. zap.Error(err),
  349. )
  350. } else {
  351. plog.Errorf("failed to parse from %s into ID (%v)", fromStr, err)
  352. }
  353. http.Error(w, "invalid from", http.StatusNotFound)
  354. return
  355. }
  356. if h.r.IsIDRemoved(uint64(from)) {
  357. if h.lg != nil {
  358. h.lg.Warn(
  359. "rejected stream from remote peer because it was removed",
  360. zap.String("local-member-id", h.tr.ID.String()),
  361. zap.String("remote-peer-id-stream-handler", h.id.String()),
  362. zap.String("remote-peer-id-from", from.String()),
  363. )
  364. } else {
  365. plog.Warningf("rejected the stream from peer %s since it was removed", from)
  366. }
  367. http.Error(w, "removed member", http.StatusGone)
  368. return
  369. }
  370. p := h.peerGetter.Get(from)
  371. if p == nil {
  372. // This may happen in following cases:
  373. // 1. user starts a remote peer that belongs to a different cluster
  374. // with the same cluster ID.
  375. // 2. local etcd falls behind of the cluster, and cannot recognize
  376. // the members that joined after its current progress.
  377. if urls := r.Header.Get("X-PeerURLs"); urls != "" {
  378. h.tr.AddRemote(from, strings.Split(urls, ","))
  379. }
  380. if h.lg != nil {
  381. h.lg.Warn(
  382. "failed to find remote peer in cluster",
  383. zap.String("local-member-id", h.tr.ID.String()),
  384. zap.String("remote-peer-id-stream-handler", h.id.String()),
  385. zap.String("remote-peer-id-from", from.String()),
  386. zap.String("cluster-id", h.cid.String()),
  387. )
  388. } else {
  389. plog.Errorf("failed to find member %s in cluster %s", from, h.cid)
  390. }
  391. http.Error(w, "error sender not found", http.StatusNotFound)
  392. return
  393. }
  394. wto := h.id.String()
  395. if gto := r.Header.Get("X-Raft-To"); gto != wto {
  396. if h.lg != nil {
  397. h.lg.Warn(
  398. "ignored streaming request; ID mismatch",
  399. zap.String("local-member-id", h.tr.ID.String()),
  400. zap.String("remote-peer-id-stream-handler", h.id.String()),
  401. zap.String("remote-peer-id-header", gto),
  402. zap.String("remote-peer-id-from", from.String()),
  403. zap.String("cluster-id", h.cid.String()),
  404. )
  405. } else {
  406. plog.Errorf("streaming request ignored (ID mismatch got %s want %s)", gto, wto)
  407. }
  408. http.Error(w, "to field mismatch", http.StatusPreconditionFailed)
  409. return
  410. }
  411. w.WriteHeader(http.StatusOK)
  412. w.(http.Flusher).Flush()
  413. c := newCloseNotifier()
  414. conn := &outgoingConn{
  415. t: t,
  416. Writer: w,
  417. Flusher: w.(http.Flusher),
  418. Closer: c,
  419. localID: h.tr.ID,
  420. peerID: h.id,
  421. }
  422. p.attachOutgoingConn(conn)
  423. <-c.closeNotify()
  424. }
  425. // checkClusterCompatibilityFromHeader checks the cluster compatibility of
  426. // the local member from the given header.
  427. // It checks whether the version of local member is compatible with
  428. // the versions in the header, and whether the cluster ID of local member
  429. // matches the one in the header.
  430. func checkClusterCompatibilityFromHeader(lg *zap.Logger, localID types.ID, header http.Header, cid types.ID) error {
  431. remoteName := header.Get("X-Server-From")
  432. remoteServer := serverVersion(header)
  433. remoteVs := ""
  434. if remoteServer != nil {
  435. remoteVs = remoteServer.String()
  436. }
  437. remoteMinClusterVer := minClusterVersion(header)
  438. remoteMinClusterVs := ""
  439. if remoteMinClusterVer != nil {
  440. remoteMinClusterVs = remoteMinClusterVer.String()
  441. }
  442. localServer, localMinCluster, err := checkVersionCompatibility(remoteName, remoteServer, remoteMinClusterVer)
  443. localVs := ""
  444. if localServer != nil {
  445. localVs = localServer.String()
  446. }
  447. localMinClusterVs := ""
  448. if localMinCluster != nil {
  449. localMinClusterVs = localMinCluster.String()
  450. }
  451. if err != nil {
  452. if lg != nil {
  453. lg.Warn(
  454. "failed to check version compatibility",
  455. zap.String("local-member-id", localID.String()),
  456. zap.String("local-member-cluster-id", cid.String()),
  457. zap.String("local-member-server-version", localVs),
  458. zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
  459. zap.String("remote-peer-server-name", remoteName),
  460. zap.String("remote-peer-server-version", remoteVs),
  461. zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
  462. zap.Error(err),
  463. )
  464. } else {
  465. plog.Errorf("request version incompatibility (%v)", err)
  466. }
  467. return errIncompatibleVersion
  468. }
  469. if gcid := header.Get("X-Etcd-Cluster-ID"); gcid != cid.String() {
  470. if lg != nil {
  471. lg.Warn(
  472. "request cluster ID mismatch",
  473. zap.String("local-member-id", localID.String()),
  474. zap.String("local-member-cluster-id", cid.String()),
  475. zap.String("local-member-server-version", localVs),
  476. zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
  477. zap.String("remote-peer-server-name", remoteName),
  478. zap.String("remote-peer-server-version", remoteVs),
  479. zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
  480. zap.String("remote-peer-cluster-id", gcid),
  481. )
  482. } else {
  483. plog.Errorf("request cluster ID mismatch (got %s want %s)", gcid, cid)
  484. }
  485. return errClusterIDMismatch
  486. }
  487. return nil
  488. }
  489. type closeNotifier struct {
  490. done chan struct{}
  491. }
  492. func newCloseNotifier() *closeNotifier {
  493. return &closeNotifier{
  494. done: make(chan struct{}),
  495. }
  496. }
  497. func (n *closeNotifier) Close() error {
  498. close(n.done)
  499. return nil
  500. }
  501. func (n *closeNotifier) closeNotify() <-chan struct{} { return n.done }