backend.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package backend
  15. import (
  16. "fmt"
  17. "hash/crc32"
  18. "io"
  19. "io/ioutil"
  20. "log"
  21. "os"
  22. "path"
  23. "sync"
  24. "sync/atomic"
  25. "time"
  26. "github.com/boltdb/bolt"
  27. )
  28. var (
  29. defaultBatchLimit = 10000
  30. defaultBatchInterval = 100 * time.Millisecond
  31. defragLimit = 10000
  32. // InitialMmapSize is the initial size of the mmapped region. Setting this larger than
  33. // the potential max db size can prevent writer from blocking reader.
  34. // This only works for linux.
  35. InitialMmapSize = int64(10 * 1024 * 1024 * 1024)
  36. )
  37. const (
  38. // DefaultQuotaBytes is the number of bytes the backend Size may
  39. // consume before exceeding the space quota.
  40. DefaultQuotaBytes = int64(2 * 1024 * 1024 * 1024) // 2GB
  41. // MaxQuotaBytes is the maximum number of bytes suggested for a backend
  42. // quota. A larger quota may lead to degraded performance.
  43. MaxQuotaBytes = int64(8 * 1024 * 1024 * 1024) // 8GB
  44. )
  45. type Backend interface {
  46. BatchTx() BatchTx
  47. Snapshot() Snapshot
  48. Hash() (uint32, error)
  49. // Size returns the current size of the backend.
  50. Size() int64
  51. Defrag() error
  52. ForceCommit()
  53. Close() error
  54. }
  55. type Snapshot interface {
  56. // Size gets the size of the snapshot.
  57. Size() int64
  58. // WriteTo writes the snapshot into the given writer.
  59. WriteTo(w io.Writer) (n int64, err error)
  60. // Close closes the snapshot.
  61. Close() error
  62. }
  63. type backend struct {
  64. // size and commits are used with atomic operations so they must be
  65. // 64-bit aligned, otherwise 32-bit tests will crash
  66. // size is the number of bytes in the backend
  67. size int64
  68. // commits counts number of commits since start
  69. commits int64
  70. mu sync.RWMutex
  71. db *bolt.DB
  72. batchInterval time.Duration
  73. batchLimit int
  74. batchTx *batchTx
  75. stopc chan struct{}
  76. donec chan struct{}
  77. }
  78. func New(path string, d time.Duration, limit int) Backend {
  79. return newBackend(path, d, limit)
  80. }
  81. func NewDefaultBackend(path string) Backend {
  82. return newBackend(path, defaultBatchInterval, defaultBatchLimit)
  83. }
  84. func newBackend(path string, d time.Duration, limit int) *backend {
  85. db, err := bolt.Open(path, 0600, boltOpenOptions)
  86. if err != nil {
  87. log.Panicf("backend: cannot open database at %s (%v)", path, err)
  88. }
  89. b := &backend{
  90. db: db,
  91. batchInterval: d,
  92. batchLimit: limit,
  93. stopc: make(chan struct{}),
  94. donec: make(chan struct{}),
  95. }
  96. b.batchTx = newBatchTx(b)
  97. go b.run()
  98. return b
  99. }
  100. // BatchTx returns the current batch tx in coalescer. The tx can be used for read and
  101. // write operations. The write result can be retrieved within the same tx immediately.
  102. // The write result is isolated with other txs until the current one get committed.
  103. func (b *backend) BatchTx() BatchTx {
  104. return b.batchTx
  105. }
  106. // ForceCommit forces the current batching tx to commit.
  107. func (b *backend) ForceCommit() {
  108. b.batchTx.Commit()
  109. }
  110. func (b *backend) Snapshot() Snapshot {
  111. b.batchTx.Commit()
  112. b.mu.RLock()
  113. defer b.mu.RUnlock()
  114. tx, err := b.db.Begin(false)
  115. if err != nil {
  116. log.Fatalf("backend: cannot begin tx (%s)", err)
  117. }
  118. return &snapshot{tx}
  119. }
  120. func (b *backend) Hash() (uint32, error) {
  121. h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
  122. b.mu.RLock()
  123. defer b.mu.RUnlock()
  124. err := b.db.View(func(tx *bolt.Tx) error {
  125. c := tx.Cursor()
  126. for next, _ := c.First(); next != nil; next, _ = c.Next() {
  127. b := tx.Bucket(next)
  128. if b == nil {
  129. return fmt.Errorf("cannot get hash of bucket %s", string(next))
  130. }
  131. h.Write(next)
  132. b.ForEach(func(k, v []byte) error {
  133. h.Write(k)
  134. h.Write(v)
  135. return nil
  136. })
  137. }
  138. return nil
  139. })
  140. if err != nil {
  141. return 0, err
  142. }
  143. return h.Sum32(), nil
  144. }
  145. func (b *backend) Size() int64 {
  146. return atomic.LoadInt64(&b.size)
  147. }
  148. func (b *backend) run() {
  149. defer close(b.donec)
  150. for {
  151. select {
  152. case <-time.After(b.batchInterval):
  153. case <-b.stopc:
  154. b.batchTx.CommitAndStop()
  155. return
  156. }
  157. b.batchTx.Commit()
  158. }
  159. }
  160. func (b *backend) Close() error {
  161. close(b.stopc)
  162. <-b.donec
  163. return b.db.Close()
  164. }
  165. // Commits returns total number of commits since start
  166. func (b *backend) Commits() int64 {
  167. return atomic.LoadInt64(&b.commits)
  168. }
  169. func (b *backend) Defrag() error {
  170. // TODO: make this non-blocking?
  171. // lock batchTx to ensure nobody is using previous tx, and then
  172. // close previous ongoing tx.
  173. b.batchTx.Lock()
  174. defer b.batchTx.Unlock()
  175. // lock database after lock tx to avoid deadlock.
  176. b.mu.Lock()
  177. defer b.mu.Unlock()
  178. b.batchTx.commit(true)
  179. b.batchTx.tx = nil
  180. tmpdb, err := bolt.Open(b.db.Path()+".tmp", 0600, boltOpenOptions)
  181. if err != nil {
  182. return err
  183. }
  184. err = defragdb(b.db, tmpdb, defragLimit)
  185. if err != nil {
  186. tmpdb.Close()
  187. os.RemoveAll(tmpdb.Path())
  188. return err
  189. }
  190. dbp := b.db.Path()
  191. tdbp := tmpdb.Path()
  192. err = b.db.Close()
  193. if err != nil {
  194. log.Fatalf("backend: cannot close database (%s)", err)
  195. }
  196. err = tmpdb.Close()
  197. if err != nil {
  198. log.Fatalf("backend: cannot close database (%s)", err)
  199. }
  200. err = os.Rename(tdbp, dbp)
  201. if err != nil {
  202. log.Fatalf("backend: cannot rename database (%s)", err)
  203. }
  204. b.db, err = bolt.Open(dbp, 0600, boltOpenOptions)
  205. if err != nil {
  206. log.Panicf("backend: cannot open database at %s (%v)", dbp, err)
  207. }
  208. b.batchTx.tx, err = b.db.Begin(true)
  209. if err != nil {
  210. log.Fatalf("backend: cannot begin tx (%s)", err)
  211. }
  212. // commit to update metadata like db.size
  213. b.batchTx.commit(false)
  214. return nil
  215. }
  216. func defragdb(odb, tmpdb *bolt.DB, limit int) error {
  217. // open a tx on tmpdb for writes
  218. tmptx, err := tmpdb.Begin(true)
  219. if err != nil {
  220. return err
  221. }
  222. // open a tx on old db for read
  223. tx, err := odb.Begin(false)
  224. if err != nil {
  225. return err
  226. }
  227. defer tx.Rollback()
  228. c := tx.Cursor()
  229. count := 0
  230. for next, _ := c.First(); next != nil; next, _ = c.Next() {
  231. b := tx.Bucket(next)
  232. if b == nil {
  233. return fmt.Errorf("backend: cannot defrag bucket %s", string(next))
  234. }
  235. tmpb, berr := tmptx.CreateBucketIfNotExists(next)
  236. if berr != nil {
  237. return berr
  238. }
  239. b.ForEach(func(k, v []byte) error {
  240. count++
  241. if count > limit {
  242. err = tmptx.Commit()
  243. if err != nil {
  244. return err
  245. }
  246. tmptx, err = tmpdb.Begin(true)
  247. if err != nil {
  248. return err
  249. }
  250. tmpb = tmptx.Bucket(next)
  251. count = 0
  252. }
  253. return tmpb.Put(k, v)
  254. })
  255. }
  256. return tmptx.Commit()
  257. }
  258. // NewTmpBackend creates a backend implementation for testing.
  259. func NewTmpBackend(batchInterval time.Duration, batchLimit int) (*backend, string) {
  260. dir, err := ioutil.TempDir(os.TempDir(), "etcd_backend_test")
  261. if err != nil {
  262. log.Fatal(err)
  263. }
  264. tmpPath := path.Join(dir, "database")
  265. return newBackend(tmpPath, batchInterval, batchLimit), tmpPath
  266. }
  267. func NewDefaultTmpBackend() (*backend, string) {
  268. return NewTmpBackend(defaultBatchInterval, defaultBatchLimit)
  269. }
  270. type snapshot struct {
  271. *bolt.Tx
  272. }
  273. func (s *snapshot) Close() error { return s.Tx.Rollback() }