watchable_store.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package mvcc
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/lease"
  20. "github.com/coreos/etcd/mvcc/backend"
  21. "github.com/coreos/etcd/mvcc/mvccpb"
  22. )
  23. const (
  24. // chanBufLen is the length of the buffered chan
  25. // for sending out watched events.
  26. // TODO: find a good buf value. 1024 is just a random one that
  27. // seems to be reasonable.
  28. chanBufLen = 1024
  29. // maxWatchersPerSync is the number of watchers to sync in a single batch
  30. maxWatchersPerSync = 512
  31. )
  32. type watchable interface {
  33. watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
  34. progress(w *watcher)
  35. rev() int64
  36. }
  37. type watchableStore struct {
  38. mu sync.Mutex
  39. *store
  40. // contains all unsynced watchers that needs to sync with events that have happened
  41. unsynced watcherGroup
  42. // contains all synced watchers that are in sync with the progress of the store.
  43. // The key of the map is the key that the watcher watches on.
  44. synced watcherGroup
  45. stopc chan struct{}
  46. wg sync.WaitGroup
  47. }
  48. // cancelFunc updates unsynced and synced maps when running
  49. // cancel operations.
  50. type cancelFunc func()
  51. func New(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) ConsistentWatchableKV {
  52. return newWatchableStore(b, le, ig)
  53. }
  54. func newWatchableStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *watchableStore {
  55. s := &watchableStore{
  56. store: NewStore(b, le, ig),
  57. unsynced: newWatcherGroup(),
  58. synced: newWatcherGroup(),
  59. stopc: make(chan struct{}),
  60. }
  61. if s.le != nil {
  62. // use this store as the deleter so revokes trigger watch events
  63. s.le.SetRangeDeleter(s)
  64. }
  65. s.wg.Add(1)
  66. go s.syncWatchersLoop()
  67. return s
  68. }
  69. func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
  70. s.mu.Lock()
  71. defer s.mu.Unlock()
  72. rev = s.store.Put(key, value, lease)
  73. changes := s.store.getChanges()
  74. if len(changes) != 1 {
  75. log.Panicf("unexpected len(changes) != 1 after put")
  76. }
  77. ev := mvccpb.Event{
  78. Type: mvccpb.PUT,
  79. Kv: &changes[0],
  80. }
  81. s.notify(rev, []mvccpb.Event{ev})
  82. return rev
  83. }
  84. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  85. s.mu.Lock()
  86. defer s.mu.Unlock()
  87. n, rev = s.store.DeleteRange(key, end)
  88. changes := s.store.getChanges()
  89. if len(changes) != int(n) {
  90. log.Panicf("unexpected len(changes) != n after deleteRange")
  91. }
  92. if n == 0 {
  93. return n, rev
  94. }
  95. evs := make([]mvccpb.Event, n)
  96. for i := range changes {
  97. evs[i] = mvccpb.Event{
  98. Type: mvccpb.DELETE,
  99. Kv: &changes[i]}
  100. evs[i].Kv.ModRevision = rev
  101. }
  102. s.notify(rev, evs)
  103. return n, rev
  104. }
  105. func (s *watchableStore) TxnBegin() int64 {
  106. s.mu.Lock()
  107. return s.store.TxnBegin()
  108. }
  109. func (s *watchableStore) TxnEnd(txnID int64) error {
  110. err := s.store.TxnEnd(txnID)
  111. if err != nil {
  112. return err
  113. }
  114. changes := s.getChanges()
  115. if len(changes) == 0 {
  116. s.mu.Unlock()
  117. return nil
  118. }
  119. rev := s.store.Rev()
  120. evs := make([]mvccpb.Event, len(changes))
  121. for i, change := range changes {
  122. switch change.CreateRevision {
  123. case 0:
  124. evs[i] = mvccpb.Event{
  125. Type: mvccpb.DELETE,
  126. Kv: &changes[i]}
  127. evs[i].Kv.ModRevision = rev
  128. default:
  129. evs[i] = mvccpb.Event{
  130. Type: mvccpb.PUT,
  131. Kv: &changes[i]}
  132. }
  133. }
  134. s.notify(rev, evs)
  135. s.mu.Unlock()
  136. return nil
  137. }
  138. func (s *watchableStore) Close() error {
  139. close(s.stopc)
  140. s.wg.Wait()
  141. return s.store.Close()
  142. }
  143. func (s *watchableStore) NewWatchStream() WatchStream {
  144. watchStreamGauge.Inc()
  145. return &watchStream{
  146. watchable: s,
  147. ch: make(chan WatchResponse, chanBufLen),
  148. cancels: make(map[WatchID]cancelFunc),
  149. watchers: make(map[WatchID]*watcher),
  150. }
  151. }
  152. func (s *watchableStore) watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
  153. s.mu.Lock()
  154. defer s.mu.Unlock()
  155. wa := &watcher{
  156. key: key,
  157. end: end,
  158. cur: startRev,
  159. id: id,
  160. ch: ch,
  161. }
  162. s.store.mu.Lock()
  163. synced := startRev > s.store.currentRev.main || startRev == 0
  164. if synced {
  165. wa.cur = s.store.currentRev.main + 1
  166. if startRev > wa.cur {
  167. wa.cur = startRev
  168. }
  169. }
  170. s.store.mu.Unlock()
  171. if synced {
  172. s.synced.add(wa)
  173. } else {
  174. slowWatcherGauge.Inc()
  175. s.unsynced.add(wa)
  176. }
  177. watcherGauge.Inc()
  178. cancel := cancelFunc(func() {
  179. s.mu.Lock()
  180. defer s.mu.Unlock()
  181. // remove references of the watcher
  182. if s.unsynced.delete(wa) {
  183. slowWatcherGauge.Dec()
  184. watcherGauge.Dec()
  185. return
  186. }
  187. if s.synced.delete(wa) {
  188. watcherGauge.Dec()
  189. }
  190. // If we cannot find it, it should have finished watch.
  191. })
  192. return wa, cancel
  193. }
  194. // syncWatchersLoop syncs the watcher in the unsynced map every 100ms.
  195. func (s *watchableStore) syncWatchersLoop() {
  196. defer s.wg.Done()
  197. for {
  198. s.mu.Lock()
  199. st := time.Now()
  200. lastUnsyncedWatchers := s.unsynced.size()
  201. s.syncWatchers()
  202. unsyncedWatchers := s.unsynced.size()
  203. s.mu.Unlock()
  204. syncDuration := time.Since(st)
  205. waitDuration := 100 * time.Millisecond
  206. // more work pending?
  207. if unsyncedWatchers != 0 && lastUnsyncedWatchers > unsyncedWatchers {
  208. // be fair to other store operations by yielding time taken
  209. waitDuration = syncDuration
  210. }
  211. select {
  212. case <-time.After(waitDuration):
  213. case <-s.stopc:
  214. return
  215. }
  216. }
  217. }
  218. // syncWatchers syncs unsynced watchers by:
  219. // 1. choose a set of watchers from the unsynced watcher group
  220. // 2. iterate over the set to get the minimum revision and remove compacted watchers
  221. // 3. use minimum revision to get all key-value pairs and send those events to watchers
  222. // 4. remove synced watchers in set from unsynced group and move to synced group
  223. func (s *watchableStore) syncWatchers() {
  224. if s.unsynced.size() == 0 {
  225. return
  226. }
  227. s.store.mu.Lock()
  228. defer s.store.mu.Unlock()
  229. // in order to find key-value pairs from unsynced watchers, we need to
  230. // find min revision index, and these revisions can be used to
  231. // query the backend store of key-value pairs
  232. curRev := s.store.currentRev.main
  233. compactionRev := s.store.compactMainRev
  234. wg, minRev := s.unsynced.choose(maxWatchersPerSync, curRev, compactionRev)
  235. minBytes, maxBytes := newRevBytes(), newRevBytes()
  236. revToBytes(revision{main: minRev}, minBytes)
  237. revToBytes(revision{main: curRev + 1}, maxBytes)
  238. // UnsafeRange returns keys and values. And in boltdb, keys are revisions.
  239. // values are actual key-value pairs in backend.
  240. tx := s.store.b.BatchTx()
  241. tx.Lock()
  242. revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
  243. evs := kvsToEvents(wg, revs, vs)
  244. tx.Unlock()
  245. wb := newWatcherBatch(wg, evs)
  246. for w := range wg.watchers {
  247. eb, ok := wb[w]
  248. if !ok {
  249. // bring un-notified watcher to synced
  250. w.cur = curRev
  251. s.synced.add(w)
  252. s.unsynced.delete(w)
  253. continue
  254. }
  255. select {
  256. case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: curRev}:
  257. pendingEventsGauge.Add(float64(len(eb.evs)))
  258. default:
  259. // TODO: handle the full unsynced watchers.
  260. // continue to process other watchers for now, the full ones
  261. // will be processed next time and hopefully it will not be full.
  262. continue
  263. }
  264. if eb.moreRev != 0 {
  265. w.cur = eb.moreRev
  266. continue
  267. }
  268. w.cur = curRev
  269. s.synced.add(w)
  270. s.unsynced.delete(w)
  271. }
  272. slowWatcherGauge.Set(float64(s.unsynced.size()))
  273. }
  274. // kvsToEvents gets all events for the watchers from all key-value pairs
  275. func kvsToEvents(wg *watcherGroup, revs, vals [][]byte) (evs []mvccpb.Event) {
  276. for i, v := range vals {
  277. var kv mvccpb.KeyValue
  278. if err := kv.Unmarshal(v); err != nil {
  279. log.Panicf("mvcc: cannot unmarshal event: %v", err)
  280. }
  281. if !wg.contains(string(kv.Key)) {
  282. continue
  283. }
  284. ty := mvccpb.PUT
  285. if isTombstone(revs[i]) {
  286. ty = mvccpb.DELETE
  287. // patch in mod revision so watchers won't skip
  288. kv.ModRevision = bytesToRev(revs[i]).main
  289. }
  290. evs = append(evs, mvccpb.Event{Kv: &kv, Type: ty})
  291. }
  292. return evs
  293. }
  294. // notify notifies the fact that given event at the given rev just happened to
  295. // watchers that watch on the key of the event.
  296. func (s *watchableStore) notify(rev int64, evs []mvccpb.Event) {
  297. for w, eb := range newWatcherBatch(&s.synced, evs) {
  298. if eb.revs != 1 {
  299. log.Panicf("mvcc: unexpected multiple revisions in notification")
  300. }
  301. select {
  302. case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.Rev()}:
  303. pendingEventsGauge.Add(float64(len(eb.evs)))
  304. default:
  305. // move slow watcher to unsynced
  306. w.cur = rev
  307. s.unsynced.add(w)
  308. s.synced.delete(w)
  309. slowWatcherGauge.Inc()
  310. }
  311. }
  312. }
  313. func (s *watchableStore) rev() int64 { return s.store.Rev() }
  314. func (s *watchableStore) progress(w *watcher) {
  315. s.mu.Lock()
  316. defer s.mu.Unlock()
  317. if _, ok := s.synced.watchers[w]; ok {
  318. select {
  319. case w.ch <- WatchResponse{WatchID: w.id, Revision: s.rev()}:
  320. default:
  321. // If the ch is full, this watcher is receiving events.
  322. // We do not need to send progress at all.
  323. }
  324. }
  325. }
  326. type watcher struct {
  327. // the watcher key
  328. key []byte
  329. // end indicates the end of the range to watch.
  330. // If end is set, the watcher is on a range.
  331. end []byte
  332. // cur is the current watcher revision of a unsynced watcher.
  333. // cur will be updated for unsynced watcher while it is catching up.
  334. // cur is startRev of a synced watcher.
  335. // cur will not be updated for synced watcher.
  336. cur int64
  337. id WatchID
  338. // a chan to send out the watch response.
  339. // The chan might be shared with other watchers.
  340. ch chan<- WatchResponse
  341. }