v3_snapshot.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package snapshot
  15. import (
  16. "context"
  17. "crypto/sha256"
  18. "encoding/json"
  19. "fmt"
  20. "hash/crc32"
  21. "io"
  22. "math"
  23. "os"
  24. "path/filepath"
  25. "reflect"
  26. "strings"
  27. "time"
  28. bolt "go.etcd.io/bbolt"
  29. "go.etcd.io/etcd/clientv3"
  30. "go.etcd.io/etcd/etcdserver"
  31. "go.etcd.io/etcd/etcdserver/api/membership"
  32. "go.etcd.io/etcd/etcdserver/api/snap"
  33. "go.etcd.io/etcd/etcdserver/api/v2store"
  34. "go.etcd.io/etcd/etcdserver/etcdserverpb"
  35. "go.etcd.io/etcd/lease"
  36. "go.etcd.io/etcd/mvcc"
  37. "go.etcd.io/etcd/mvcc/backend"
  38. "go.etcd.io/etcd/pkg/fileutil"
  39. "go.etcd.io/etcd/pkg/types"
  40. "go.etcd.io/etcd/raft"
  41. "go.etcd.io/etcd/raft/raftpb"
  42. "go.etcd.io/etcd/wal"
  43. "go.etcd.io/etcd/wal/walpb"
  44. "go.uber.org/zap"
  45. )
// Manager defines snapshot methods.
type Manager interface {
	// Save fetches snapshot from remote etcd server and saves data
	// to target path. If the context "ctx" is canceled or timed out,
	// snapshot save stream will error out (e.g. context.Canceled,
	// context.DeadlineExceeded). Make sure to specify only one endpoint
	// in client configuration. Snapshot API must be requested to a
	// selected node, and saved snapshot is the point-in-time state of
	// the selected node.
	Save(ctx context.Context, cfg clientv3.Config, dbPath string) error

	// Status returns the snapshot file information for the snapshot
	// file located at "dbPath".
	Status(dbPath string) (Status, error)

	// Restore restores a new etcd data directory from given snapshot
	// file. It returns an error if specified data directory already
	// exists, to prevent unintended data directory overwrites.
	Restore(cfg RestoreConfig) error
}
  63. // NewV3 returns a new snapshot Manager for v3.x snapshot.
  64. func NewV3(lg *zap.Logger) Manager {
  65. if lg == nil {
  66. lg = zap.NewExample()
  67. }
  68. return &v3Manager{lg: lg}
  69. }
// v3Manager is the Manager implementation returned by NewV3.
// Every field except lg is working state that Restore fills in from its
// RestoreConfig before calling the saveDB and saveWALAndSnap helpers.
type v3Manager struct {
	// lg is the logger used by all operations; NewV3 never leaves it nil.
	lg *zap.Logger

	// name is the member name being restored (RestoreConfig.Name).
	name string

	// dbPath is the snapshot file to restore from (RestoreConfig.SnapshotPath).
	dbPath string

	// walDir is the directory in which the new WAL is created.
	walDir string

	// snapDir is the "member/snap" directory under the output data directory.
	snapDir string

	// cl is the cluster built from the initial cluster configuration.
	cl *membership.RaftCluster

	// skipHashCheck disables the snapshot integrity hash verification.
	skipHashCheck bool
}
  79. // Save fetches snapshot from remote etcd server and saves data to target path.
  80. func (s *v3Manager) Save(ctx context.Context, cfg clientv3.Config, dbPath string) error {
  81. if len(cfg.Endpoints) != 1 {
  82. return fmt.Errorf("snapshot must be requested to one selected node, not multiple %v", cfg.Endpoints)
  83. }
  84. cli, err := clientv3.New(cfg)
  85. if err != nil {
  86. return err
  87. }
  88. defer cli.Close()
  89. partpath := dbPath + ".part"
  90. defer os.RemoveAll(partpath)
  91. var f *os.File
  92. f, err = os.OpenFile(partpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, fileutil.PrivateFileMode)
  93. if err != nil {
  94. return fmt.Errorf("could not open %s (%v)", partpath, err)
  95. }
  96. s.lg.Info(
  97. "created temporary db file",
  98. zap.String("path", partpath),
  99. )
  100. now := time.Now()
  101. var rd io.ReadCloser
  102. rd, err = cli.Snapshot(ctx)
  103. if err != nil {
  104. return err
  105. }
  106. s.lg.Info(
  107. "fetching snapshot",
  108. zap.String("endpoint", cfg.Endpoints[0]),
  109. )
  110. if _, err = io.Copy(f, rd); err != nil {
  111. return err
  112. }
  113. if err = fileutil.Fsync(f); err != nil {
  114. return err
  115. }
  116. if err = f.Close(); err != nil {
  117. return err
  118. }
  119. s.lg.Info(
  120. "fetched snapshot",
  121. zap.String("endpoint", cfg.Endpoints[0]),
  122. zap.Duration("took", time.Since(now)),
  123. )
  124. if err = os.Rename(partpath, dbPath); err != nil {
  125. return fmt.Errorf("could not rename %s to %s (%v)", partpath, dbPath, err)
  126. }
  127. s.lg.Info("saved", zap.String("path", dbPath))
  128. return nil
  129. }
// Status is the snapshot file status.
type Status struct {
	// Hash is the CRC-32 (Castagnoli) checksum computed over every bucket
	// name and every key and value stored in the snapshot file.
	Hash uint32 `json:"hash"`

	// Revision is the main revision decoded from the last key visited in
	// the "key" bucket.
	Revision int64 `json:"revision"`

	// TotalKey is the number of key/value entries visited across all buckets.
	TotalKey int `json:"totalKey"`

	// TotalSize is the database size reported by the bolt read transaction.
	TotalSize int64 `json:"totalSize"`
}
  137. // Status returns the snapshot file information.
  138. func (s *v3Manager) Status(dbPath string) (ds Status, err error) {
  139. if _, err = os.Stat(dbPath); err != nil {
  140. return ds, err
  141. }
  142. db, err := bolt.Open(dbPath, 0400, &bolt.Options{ReadOnly: true})
  143. if err != nil {
  144. return ds, err
  145. }
  146. defer db.Close()
  147. h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
  148. if err = db.View(func(tx *bolt.Tx) error {
  149. // check snapshot file integrity first
  150. var dbErrStrings []string
  151. for dbErr := range tx.Check() {
  152. dbErrStrings = append(dbErrStrings, dbErr.Error())
  153. }
  154. if len(dbErrStrings) > 0 {
  155. return fmt.Errorf("snapshot file integrity check failed. %d errors found.\n"+strings.Join(dbErrStrings, "\n"), len(dbErrStrings))
  156. }
  157. ds.TotalSize = tx.Size()
  158. c := tx.Cursor()
  159. for next, _ := c.First(); next != nil; next, _ = c.Next() {
  160. b := tx.Bucket(next)
  161. if b == nil {
  162. return fmt.Errorf("cannot get hash of bucket %s", string(next))
  163. }
  164. h.Write(next)
  165. iskeyb := (string(next) == "key")
  166. b.ForEach(func(k, v []byte) error {
  167. h.Write(k)
  168. h.Write(v)
  169. if iskeyb {
  170. rev := bytesToRev(k)
  171. ds.Revision = rev.main
  172. }
  173. ds.TotalKey++
  174. return nil
  175. })
  176. }
  177. return nil
  178. }); err != nil {
  179. return ds, err
  180. }
  181. ds.Hash = h.Sum32()
  182. return ds, nil
  183. }
// RestoreConfig configures snapshot restore operation.
type RestoreConfig struct {
	// SnapshotPath is the path of snapshot file to restore from.
	SnapshotPath string

	// Name is the human-readable name of this member.
	Name string

	// OutputDataDir is the target data directory to save restored data.
	// OutputDataDir should not conflict with existing etcd data directory.
	// If OutputDataDir already exists, it will return an error to prevent
	// unintended data directory overwrites.
	// If empty, defaults to "[Name].etcd".
	OutputDataDir string

	// OutputWALDir is the target WAL data directory.
	// If empty, defaults to "[OutputDataDir]/member/wal".
	// If given and the directory already exists, restore returns an error.
	OutputWALDir string

	// PeerURLs is a list of member's peer URLs to advertise to the rest of the cluster.
	PeerURLs []string

	// InitialCluster is the initial cluster configuration for restore bootstrap.
	InitialCluster string

	// InitialClusterToken is the initial cluster token for etcd cluster during restore bootstrap.
	InitialClusterToken string

	// SkipHashCheck is "true" to ignore snapshot integrity hash value
	// (required if copied from data directory).
	SkipHashCheck bool
}
  209. // Restore restores a new etcd data directory from given snapshot file.
  210. func (s *v3Manager) Restore(cfg RestoreConfig) error {
  211. pURLs, err := types.NewURLs(cfg.PeerURLs)
  212. if err != nil {
  213. return err
  214. }
  215. var ics types.URLsMap
  216. ics, err = types.NewURLsMap(cfg.InitialCluster)
  217. if err != nil {
  218. return err
  219. }
  220. srv := etcdserver.ServerConfig{
  221. Logger: s.lg,
  222. Name: cfg.Name,
  223. PeerURLs: pURLs,
  224. InitialPeerURLsMap: ics,
  225. InitialClusterToken: cfg.InitialClusterToken,
  226. }
  227. if err = srv.VerifyBootstrap(); err != nil {
  228. return err
  229. }
  230. s.cl, err = membership.NewClusterFromURLsMap(s.lg, cfg.InitialClusterToken, ics)
  231. if err != nil {
  232. return err
  233. }
  234. dataDir := cfg.OutputDataDir
  235. if dataDir == "" {
  236. dataDir = cfg.Name + ".etcd"
  237. }
  238. if fileutil.Exist(dataDir) {
  239. return fmt.Errorf("data-dir %q exists", dataDir)
  240. }
  241. walDir := cfg.OutputWALDir
  242. if walDir == "" {
  243. walDir = filepath.Join(dataDir, "member", "wal")
  244. } else if fileutil.Exist(walDir) {
  245. return fmt.Errorf("wal-dir %q exists", walDir)
  246. }
  247. s.name = cfg.Name
  248. s.dbPath = cfg.SnapshotPath
  249. s.walDir = walDir
  250. s.snapDir = filepath.Join(dataDir, "member", "snap")
  251. s.skipHashCheck = cfg.SkipHashCheck
  252. s.lg.Info(
  253. "restoring snapshot",
  254. zap.String("path", s.dbPath),
  255. zap.String("wal-dir", s.walDir),
  256. zap.String("data-dir", dataDir),
  257. zap.String("snap-dir", s.snapDir),
  258. )
  259. if err = s.saveDB(); err != nil {
  260. return err
  261. }
  262. if err = s.saveWALAndSnap(); err != nil {
  263. return err
  264. }
  265. s.lg.Info(
  266. "restored snapshot",
  267. zap.String("path", s.dbPath),
  268. zap.String("wal-dir", s.walDir),
  269. zap.String("data-dir", dataDir),
  270. zap.String("snap-dir", s.snapDir),
  271. )
  272. return nil
  273. }
  274. // saveDB copies the database snapshot to the snapshot directory
  275. func (s *v3Manager) saveDB() error {
  276. f, ferr := os.OpenFile(s.dbPath, os.O_RDONLY, 0600)
  277. if ferr != nil {
  278. return ferr
  279. }
  280. defer f.Close()
  281. // get snapshot integrity hash
  282. if _, err := f.Seek(-sha256.Size, io.SeekEnd); err != nil {
  283. return err
  284. }
  285. sha := make([]byte, sha256.Size)
  286. if _, err := f.Read(sha); err != nil {
  287. return err
  288. }
  289. if _, err := f.Seek(0, io.SeekStart); err != nil {
  290. return err
  291. }
  292. if err := fileutil.CreateDirAll(s.snapDir); err != nil {
  293. return err
  294. }
  295. dbpath := filepath.Join(s.snapDir, "db")
  296. db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
  297. if dberr != nil {
  298. return dberr
  299. }
  300. if _, err := io.Copy(db, f); err != nil {
  301. return err
  302. }
  303. // truncate away integrity hash, if any.
  304. off, serr := db.Seek(0, io.SeekEnd)
  305. if serr != nil {
  306. return serr
  307. }
  308. hasHash := (off % 512) == sha256.Size
  309. if hasHash {
  310. if err := db.Truncate(off - sha256.Size); err != nil {
  311. return err
  312. }
  313. }
  314. if !hasHash && !s.skipHashCheck {
  315. return fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
  316. }
  317. if hasHash && !s.skipHashCheck {
  318. // check for match
  319. if _, err := db.Seek(0, io.SeekStart); err != nil {
  320. return err
  321. }
  322. h := sha256.New()
  323. if _, err := io.Copy(h, db); err != nil {
  324. return err
  325. }
  326. dbsha := h.Sum(nil)
  327. if !reflect.DeepEqual(sha, dbsha) {
  328. return fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
  329. }
  330. }
  331. // db hash is OK, can now modify DB so it can be part of a new cluster
  332. db.Close()
  333. commit := len(s.cl.Members())
  334. // update consistentIndex so applies go through on etcdserver despite
  335. // having a new raft instance
  336. be := backend.NewDefaultBackend(dbpath)
  337. // a lessor never timeouts leases
  338. lessor := lease.NewLessor(s.lg, be, lease.LessorConfig{MinLeaseTTL: math.MaxInt64})
  339. mvs := mvcc.NewStore(s.lg, be, lessor, (*initIndex)(&commit), mvcc.StoreConfig{CompactionBatchLimit: math.MaxInt32})
  340. txn := mvs.Write()
  341. btx := be.BatchTx()
  342. del := func(k, v []byte) error {
  343. txn.DeleteRange(k, nil)
  344. return nil
  345. }
  346. // delete stored members from old cluster since using new members
  347. btx.UnsafeForEach([]byte("members"), del)
  348. // todo: add back new members when we start to deprecate old snap file.
  349. btx.UnsafeForEach([]byte("members_removed"), del)
  350. // trigger write-out of new consistent index
  351. txn.End()
  352. mvs.Commit()
  353. mvs.Close()
  354. be.Close()
  355. return nil
  356. }
  357. // saveWALAndSnap creates a WAL for the initial cluster
  358. func (s *v3Manager) saveWALAndSnap() error {
  359. if err := fileutil.CreateDirAll(s.walDir); err != nil {
  360. return err
  361. }
  362. // add members again to persist them to the store we create.
  363. st := v2store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
  364. s.cl.SetStore(st)
  365. for _, m := range s.cl.Members() {
  366. s.cl.AddMember(m)
  367. }
  368. m := s.cl.MemberByName(s.name)
  369. md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(s.cl.ID())}
  370. metadata, merr := md.Marshal()
  371. if merr != nil {
  372. return merr
  373. }
  374. w, walerr := wal.Create(s.lg, s.walDir, metadata)
  375. if walerr != nil {
  376. return walerr
  377. }
  378. defer w.Close()
  379. peers := make([]raft.Peer, len(s.cl.MemberIDs()))
  380. for i, id := range s.cl.MemberIDs() {
  381. ctx, err := json.Marshal((*s.cl).Member(id))
  382. if err != nil {
  383. return err
  384. }
  385. peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
  386. }
  387. ents := make([]raftpb.Entry, len(peers))
  388. nodeIDs := make([]uint64, len(peers))
  389. for i, p := range peers {
  390. nodeIDs[i] = p.ID
  391. cc := raftpb.ConfChange{
  392. Type: raftpb.ConfChangeAddNode,
  393. NodeID: p.ID,
  394. Context: p.Context,
  395. }
  396. d, err := cc.Marshal()
  397. if err != nil {
  398. return err
  399. }
  400. ents[i] = raftpb.Entry{
  401. Type: raftpb.EntryConfChange,
  402. Term: 1,
  403. Index: uint64(i + 1),
  404. Data: d,
  405. }
  406. }
  407. commit, term := uint64(len(ents)), uint64(1)
  408. if err := w.Save(raftpb.HardState{
  409. Term: term,
  410. Vote: peers[0].ID,
  411. Commit: commit,
  412. }, ents); err != nil {
  413. return err
  414. }
  415. b, berr := st.Save()
  416. if berr != nil {
  417. return berr
  418. }
  419. raftSnap := raftpb.Snapshot{
  420. Data: b,
  421. Metadata: raftpb.SnapshotMetadata{
  422. Index: commit,
  423. Term: term,
  424. ConfState: raftpb.ConfState{
  425. Voters: nodeIDs,
  426. },
  427. },
  428. }
  429. sn := snap.New(s.lg, s.snapDir)
  430. if err := sn.SaveSnap(raftSnap); err != nil {
  431. return err
  432. }
  433. return w.SaveSnapshot(walpb.Snapshot{Index: commit, Term: term})
  434. }