v3_snapshot.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package snapshot
  15. import (
  16. "context"
  17. "crypto/sha256"
  18. "encoding/json"
  19. "fmt"
  20. "hash/crc32"
  21. "io"
  22. "math"
  23. "os"
  24. "path/filepath"
  25. "reflect"
  26. "time"
  27. "github.com/coreos/etcd/clientv3"
  28. "github.com/coreos/etcd/etcdserver"
  29. "github.com/coreos/etcd/etcdserver/etcdserverpb"
  30. "github.com/coreos/etcd/etcdserver/membership"
  31. "github.com/coreos/etcd/lease"
  32. "github.com/coreos/etcd/mvcc"
  33. "github.com/coreos/etcd/mvcc/backend"
  34. "github.com/coreos/etcd/pkg/fileutil"
  35. "github.com/coreos/etcd/pkg/types"
  36. "github.com/coreos/etcd/raft"
  37. "github.com/coreos/etcd/raft/raftpb"
  38. "github.com/coreos/etcd/snap"
  39. "github.com/coreos/etcd/store"
  40. "github.com/coreos/etcd/wal"
  41. "github.com/coreos/etcd/wal/walpb"
  42. bolt "github.com/coreos/bbolt"
  43. "go.uber.org/zap"
  44. )
  45. // Manager defines snapshot methods.
  46. type Manager interface {
  47. // Save fetches snapshot from remote etcd server and saves data
  48. // to target path. If the context "ctx" is canceled or timed out,
  49. // snapshot save stream will error out (e.g. context.Canceled,
  50. // context.DeadlineExceeded). Make sure to specify only one endpoint
  51. // in client configuration. Snapshot API must be requested to a
  52. // selected node, and saved snapshot is the point-in-time state of
  53. // the selected node.
  54. Save(ctx context.Context, cfg clientv3.Config, dbPath string) error
  55. // Status returns the snapshot file information.
  56. Status(dbPath string) (Status, error)
  57. // Restore restores a new etcd data directory from given snapshot
  58. // file. It returns an error if specified data directory already
  59. // exists, to prevent unintended data directory overwrites.
  60. Restore(cfg RestoreConfig) error
  61. }
  62. // NewV3 returns a new snapshot Manager for v3.x snapshot.
  63. func NewV3(lg *zap.Logger) Manager {
  64. if lg == nil {
  65. lg = zap.NewExample()
  66. }
  67. return &v3Manager{lg: lg}
  68. }
  69. type v3Manager struct {
  70. lg *zap.Logger
  71. name string
  72. dbPath string
  73. walDir string
  74. snapDir string
  75. cl *membership.RaftCluster
  76. skipHashCheck bool
  77. }
  78. // Save fetches snapshot from remote etcd server and saves data to target path.
  79. func (s *v3Manager) Save(ctx context.Context, cfg clientv3.Config, dbPath string) error {
  80. if len(cfg.Endpoints) != 1 {
  81. return fmt.Errorf("snapshot must be requested to one selected node, not multiple %v", cfg.Endpoints)
  82. }
  83. cli, err := clientv3.New(cfg)
  84. if err != nil {
  85. return err
  86. }
  87. defer cli.Close()
  88. partpath := dbPath + ".part"
  89. defer os.RemoveAll(partpath)
  90. var f *os.File
  91. f, err = os.Create(partpath)
  92. if err != nil {
  93. return fmt.Errorf("could not open %s (%v)", partpath, err)
  94. }
  95. s.lg.Info(
  96. "created temporary db file",
  97. zap.String("path", partpath),
  98. )
  99. now := time.Now()
  100. var rd io.ReadCloser
  101. rd, err = cli.Snapshot(ctx)
  102. if err != nil {
  103. return err
  104. }
  105. s.lg.Info(
  106. "fetching snapshot",
  107. zap.String("endpoint", cfg.Endpoints[0]),
  108. )
  109. if _, err = io.Copy(f, rd); err != nil {
  110. return err
  111. }
  112. if err = fileutil.Fsync(f); err != nil {
  113. return err
  114. }
  115. if err = f.Close(); err != nil {
  116. return err
  117. }
  118. s.lg.Info(
  119. "fetched snapshot",
  120. zap.String("endpoint", cfg.Endpoints[0]),
  121. zap.Duration("took", time.Since(now)),
  122. )
  123. if err = os.Rename(partpath, dbPath); err != nil {
  124. return fmt.Errorf("could not rename %s to %s (%v)", partpath, dbPath, err)
  125. }
  126. s.lg.Info("saved", zap.String("path", dbPath))
  127. return nil
  128. }
  129. // Status is the snapshot file status.
  130. type Status struct {
  131. Hash uint32 `json:"hash"`
  132. Revision int64 `json:"revision"`
  133. TotalKey int `json:"totalKey"`
  134. TotalSize int64 `json:"totalSize"`
  135. }
  136. // Status returns the snapshot file information.
  137. func (s *v3Manager) Status(dbPath string) (ds Status, err error) {
  138. if _, err = os.Stat(dbPath); err != nil {
  139. return ds, err
  140. }
  141. db, err := bolt.Open(dbPath, 0400, &bolt.Options{ReadOnly: true})
  142. if err != nil {
  143. return ds, err
  144. }
  145. defer db.Close()
  146. h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
  147. if err = db.View(func(tx *bolt.Tx) error {
  148. ds.TotalSize = tx.Size()
  149. c := tx.Cursor()
  150. for next, _ := c.First(); next != nil; next, _ = c.Next() {
  151. b := tx.Bucket(next)
  152. if b == nil {
  153. return fmt.Errorf("cannot get hash of bucket %s", string(next))
  154. }
  155. h.Write(next)
  156. iskeyb := (string(next) == "key")
  157. b.ForEach(func(k, v []byte) error {
  158. h.Write(k)
  159. h.Write(v)
  160. if iskeyb {
  161. rev := bytesToRev(k)
  162. ds.Revision = rev.main
  163. }
  164. ds.TotalKey++
  165. return nil
  166. })
  167. }
  168. return nil
  169. }); err != nil {
  170. return ds, err
  171. }
  172. ds.Hash = h.Sum32()
  173. return ds, nil
  174. }
  175. // RestoreConfig configures snapshot restore operation.
  176. type RestoreConfig struct {
  177. // SnapshotPath is the path of snapshot file to restore from.
  178. SnapshotPath string
  179. // Name is the human-readable name of this member.
  180. Name string
  181. // OutputDataDir is the target data directory to save restored data.
  182. // OutputDataDir should not conflict with existing etcd data directory.
  183. // If OutputDataDir already exists, it will return an error to prevent
  184. // unintended data directory overwrites.
  185. // If empty, defaults to "[Name].etcd" if not given.
  186. OutputDataDir string
  187. // OutputWALDir is the target WAL data directory.
  188. // If empty, defaults to "[OutputDataDir]/member/wal" if not given.
  189. OutputWALDir string
  190. // PeerURLs is a list of member's peer URLs to advertise to the rest of the cluster.
  191. PeerURLs []string
  192. // InitialCluster is the initial cluster configuration for restore bootstrap.
  193. InitialCluster string
  194. // InitialClusterToken is the initial cluster token for etcd cluster during restore bootstrap.
  195. InitialClusterToken string
  196. // SkipHashCheck is "true" to ignore snapshot integrity hash value
  197. // (required if copied from data directory).
  198. SkipHashCheck bool
  199. }
  200. // Restore restores a new etcd data directory from given snapshot file.
  201. func (s *v3Manager) Restore(cfg RestoreConfig) error {
  202. pURLs, err := types.NewURLs(cfg.PeerURLs)
  203. if err != nil {
  204. return err
  205. }
  206. var ics types.URLsMap
  207. ics, err = types.NewURLsMap(cfg.InitialCluster)
  208. if err != nil {
  209. return err
  210. }
  211. srv := etcdserver.ServerConfig{
  212. Name: cfg.Name,
  213. PeerURLs: pURLs,
  214. InitialPeerURLsMap: ics,
  215. InitialClusterToken: cfg.InitialClusterToken,
  216. }
  217. if err = srv.VerifyBootstrap(); err != nil {
  218. return err
  219. }
  220. s.cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, ics)
  221. if err != nil {
  222. return err
  223. }
  224. dataDir := cfg.OutputDataDir
  225. if dataDir == "" {
  226. dataDir = cfg.Name + ".etcd"
  227. }
  228. if fileutil.Exist(dataDir) {
  229. return fmt.Errorf("data-dir %q exists", dataDir)
  230. }
  231. walDir := cfg.OutputWALDir
  232. if walDir == "" {
  233. walDir = filepath.Join(dataDir, "member", "wal")
  234. } else if fileutil.Exist(walDir) {
  235. return fmt.Errorf("wal-dir %q exists", walDir)
  236. }
  237. s.name = cfg.Name
  238. s.dbPath = cfg.SnapshotPath
  239. s.walDir = walDir
  240. s.snapDir = filepath.Join(dataDir, "member", "snap")
  241. s.skipHashCheck = cfg.SkipHashCheck
  242. s.lg.Info(
  243. "restoring snapshot",
  244. zap.String("path", s.dbPath),
  245. zap.String("wal-dir", s.walDir),
  246. zap.String("data-dir", dataDir),
  247. zap.String("snap-dir", s.snapDir),
  248. )
  249. if err = s.saveDB(); err != nil {
  250. return err
  251. }
  252. if err = s.saveWALAndSnap(); err != nil {
  253. return err
  254. }
  255. s.lg.Info(
  256. "restored snapshot",
  257. zap.String("path", s.dbPath),
  258. zap.String("wal-dir", s.walDir),
  259. zap.String("data-dir", dataDir),
  260. zap.String("snap-dir", s.snapDir),
  261. )
  262. return nil
  263. }
  264. // saveDB copies the database snapshot to the snapshot directory
  265. func (s *v3Manager) saveDB() error {
  266. f, ferr := os.OpenFile(s.dbPath, os.O_RDONLY, 0600)
  267. if ferr != nil {
  268. return ferr
  269. }
  270. defer f.Close()
  271. // get snapshot integrity hash
  272. if _, err := f.Seek(-sha256.Size, io.SeekEnd); err != nil {
  273. return err
  274. }
  275. sha := make([]byte, sha256.Size)
  276. if _, err := f.Read(sha); err != nil {
  277. return err
  278. }
  279. if _, err := f.Seek(0, io.SeekStart); err != nil {
  280. return err
  281. }
  282. if err := fileutil.CreateDirAll(s.snapDir); err != nil {
  283. return err
  284. }
  285. dbpath := filepath.Join(s.snapDir, "db")
  286. db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
  287. if dberr != nil {
  288. return dberr
  289. }
  290. if _, err := io.Copy(db, f); err != nil {
  291. return err
  292. }
  293. // truncate away integrity hash, if any.
  294. off, serr := db.Seek(0, io.SeekEnd)
  295. if serr != nil {
  296. return serr
  297. }
  298. hasHash := (off % 512) == sha256.Size
  299. if hasHash {
  300. if err := db.Truncate(off - sha256.Size); err != nil {
  301. return err
  302. }
  303. }
  304. if !hasHash && !s.skipHashCheck {
  305. return fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
  306. }
  307. if hasHash && !s.skipHashCheck {
  308. // check for match
  309. if _, err := db.Seek(0, io.SeekStart); err != nil {
  310. return err
  311. }
  312. h := sha256.New()
  313. if _, err := io.Copy(h, db); err != nil {
  314. return err
  315. }
  316. dbsha := h.Sum(nil)
  317. if !reflect.DeepEqual(sha, dbsha) {
  318. return fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
  319. }
  320. }
  321. // db hash is OK, can now modify DB so it can be part of a new cluster
  322. db.Close()
  323. commit := len(s.cl.Members())
  324. // update consistentIndex so applies go through on etcdserver despite
  325. // having a new raft instance
  326. be := backend.NewDefaultBackend(dbpath)
  327. // a lessor never timeouts leases
  328. lessor := lease.NewLessor(be, math.MaxInt64)
  329. mvs := mvcc.NewStore(be, lessor, (*initIndex)(&commit))
  330. txn := mvs.Write()
  331. btx := be.BatchTx()
  332. del := func(k, v []byte) error {
  333. txn.DeleteRange(k, nil)
  334. return nil
  335. }
  336. // delete stored members from old cluster since using new members
  337. btx.UnsafeForEach([]byte("members"), del)
  338. // todo: add back new members when we start to deprecate old snap file.
  339. btx.UnsafeForEach([]byte("members_removed"), del)
  340. // trigger write-out of new consistent index
  341. txn.End()
  342. mvs.Commit()
  343. mvs.Close()
  344. be.Close()
  345. return nil
  346. }
  347. // saveWALAndSnap creates a WAL for the initial cluster
  348. func (s *v3Manager) saveWALAndSnap() error {
  349. if err := fileutil.CreateDirAll(s.walDir); err != nil {
  350. return err
  351. }
  352. // add members again to persist them to the store we create.
  353. st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
  354. s.cl.SetStore(st)
  355. for _, m := range s.cl.Members() {
  356. s.cl.AddMember(m)
  357. }
  358. m := s.cl.MemberByName(s.name)
  359. md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(s.cl.ID())}
  360. metadata, merr := md.Marshal()
  361. if merr != nil {
  362. return merr
  363. }
  364. w, walerr := wal.Create(s.walDir, metadata)
  365. if walerr != nil {
  366. return walerr
  367. }
  368. defer w.Close()
  369. peers := make([]raft.Peer, len(s.cl.MemberIDs()))
  370. for i, id := range s.cl.MemberIDs() {
  371. ctx, err := json.Marshal((*s.cl).Member(id))
  372. if err != nil {
  373. return err
  374. }
  375. peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
  376. }
  377. ents := make([]raftpb.Entry, len(peers))
  378. nodeIDs := make([]uint64, len(peers))
  379. for i, p := range peers {
  380. nodeIDs[i] = p.ID
  381. cc := raftpb.ConfChange{
  382. Type: raftpb.ConfChangeAddNode,
  383. NodeID: p.ID,
  384. Context: p.Context,
  385. }
  386. d, err := cc.Marshal()
  387. if err != nil {
  388. return err
  389. }
  390. ents[i] = raftpb.Entry{
  391. Type: raftpb.EntryConfChange,
  392. Term: 1,
  393. Index: uint64(i + 1),
  394. Data: d,
  395. }
  396. }
  397. commit, term := uint64(len(ents)), uint64(1)
  398. if err := w.Save(raftpb.HardState{
  399. Term: term,
  400. Vote: peers[0].ID,
  401. Commit: commit,
  402. }, ents); err != nil {
  403. return err
  404. }
  405. b, berr := st.Save()
  406. if berr != nil {
  407. return berr
  408. }
  409. raftSnap := raftpb.Snapshot{
  410. Data: b,
  411. Metadata: raftpb.SnapshotMetadata{
  412. Index: commit,
  413. Term: term,
  414. ConfState: raftpb.ConfState{
  415. Nodes: nodeIDs,
  416. },
  417. },
  418. }
  419. sn := snap.New(s.snapDir)
  420. if err := sn.SaveSnap(raftSnap); err != nil {
  421. return err
  422. }
  423. return w.SaveSnapshot(walpb.Snapshot{Index: commit, Term: term})
  424. }