// watchable_store.go
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/lease"
  20. "github.com/coreos/etcd/storage/backend"
  21. "github.com/coreos/etcd/storage/storagepb"
  22. )
const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)
// watchable is the interface a store must implement to serve watchers.
type watchable interface {
	// watch registers a watcher over the range [key, end) starting at
	// startRev, delivering responses on ch. The returned cancelFunc
	// releases the watcher.
	watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
	// progress notifies the watcher of the current store revision
	// (only if it is fully synced; see watchableStore.progress).
	progress(w *watcher)
	// rev returns the current revision of the store.
	rev() int64
}
// watchableStore layers watch support on top of an embedded *store.
// Every watcher lives in exactly one of two groups: unsynced (still
// catching up on past events) or synced (up to date with the store).
type watchableStore struct {
	// mu protects the watcher groups below.
	// Lock ordering: mu is always acquired before store.mu
	// (see watch, Put, and syncWatchersLoop).
	mu sync.Mutex

	*store

	// contains all unsynced watchers that needs to sync with events that have happened
	unsynced watcherGroup

	// contains all synced watchers that are in sync with the progress of the store.
	// The key of the map is the key that the watcher watches on.
	synced watcherGroup

	// stopc signals syncWatchersLoop to exit; wg is waited on by Close
	// so the loop is fully stopped before the backing store is closed.
	stopc chan struct{}
	wg    sync.WaitGroup
}
// cancelFunc updates unsynced and synced maps when running
// cancel operations. It is returned by watch and removes the
// registered watcher from whichever group currently holds it.
type cancelFunc func()
// New returns a ConsistentWatchableKV backed by b, using le for lease
// management and ig to read the consistent index.
func New(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) ConsistentWatchableKV {
	return newWatchableStore(b, le, ig)
}
  52. func newWatchableStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *watchableStore {
  53. s := &watchableStore{
  54. store: NewStore(b, le, ig),
  55. unsynced: newWatcherGroup(),
  56. synced: newWatcherGroup(),
  57. stopc: make(chan struct{}),
  58. }
  59. if s.le != nil {
  60. // use this store as the deleter so revokes trigger watch events
  61. s.le.SetRangeDeleter(s)
  62. }
  63. s.wg.Add(1)
  64. go s.syncWatchersLoop()
  65. return s
  66. }
  67. func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
  68. s.mu.Lock()
  69. defer s.mu.Unlock()
  70. rev = s.store.Put(key, value, lease)
  71. changes := s.store.getChanges()
  72. if len(changes) != 1 {
  73. log.Panicf("unexpected len(changes) != 1 after put")
  74. }
  75. ev := storagepb.Event{
  76. Type: storagepb.PUT,
  77. Kv: &changes[0],
  78. }
  79. s.notify(rev, []storagepb.Event{ev})
  80. return rev
  81. }
  82. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  83. s.mu.Lock()
  84. defer s.mu.Unlock()
  85. n, rev = s.store.DeleteRange(key, end)
  86. changes := s.store.getChanges()
  87. if len(changes) != int(n) {
  88. log.Panicf("unexpected len(changes) != n after deleteRange")
  89. }
  90. if n == 0 {
  91. return n, rev
  92. }
  93. evs := make([]storagepb.Event, n)
  94. for i, change := range changes {
  95. evs[i] = storagepb.Event{
  96. Type: storagepb.DELETE,
  97. Kv: &change}
  98. evs[i].Kv.ModRevision = rev
  99. }
  100. s.notify(rev, evs)
  101. return n, rev
  102. }
// TxnBegin starts a txn on the underlying store and locks s.mu. The
// mutex is intentionally NOT released here: it is held across the whole
// txn and released by a successful TxnEnd, so watcher notification for
// the txn's changes cannot interleave with other updates.
func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	return s.store.TxnBegin()
}
// TxnEnd ends the txn started by TxnBegin and notifies watchers of all
// changes accumulated during the txn at the txn's final revision.
//
// NOTE(review): if store.TxnEnd fails (e.g. txn ID mismatch) s.mu stays
// locked — this mirrors TxnBegin's lock being released only by a
// successful TxnEnd; confirm callers retry with the correct ID.
func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		return err
	}

	changes := s.getChanges()
	if len(changes) == 0 {
		s.mu.Unlock()
		return nil
	}

	rev := s.store.Rev()
	evs := make([]storagepb.Event, len(changes))
	for i, change := range changes {
		switch change.CreateRevision {
		case 0:
			// CreateRevision 0 marks a key deleted within this txn.
			evs[i] = storagepb.Event{
				Type: storagepb.DELETE,
				Kv:   &changes[i]}
			// deletions do not carry the revision; patch it in so
			// watchers see the correct ModRevision.
			evs[i].Kv.ModRevision = rev
		default:
			evs[i] = storagepb.Event{
				Type: storagepb.PUT,
				Kv:   &changes[i]}
		}
	}

	s.notify(rev, evs)
	s.mu.Unlock()
	return nil
}
// Close stops the background sync goroutine and then closes the
// underlying store. stopc is closed exactly once, so Close must not be
// called more than once (a second close of stopc would panic).
func (s *watchableStore) Close() error {
	// Signal syncWatchersLoop to exit and wait for it before closing the
	// store it still reads from.
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}
  141. func (s *watchableStore) NewWatchStream() WatchStream {
  142. watchStreamGauge.Inc()
  143. return &watchStream{
  144. watchable: s,
  145. ch: make(chan WatchResponse, chanBufLen),
  146. cancels: make(map[WatchID]cancelFunc),
  147. watchers: make(map[WatchID]*watcher),
  148. }
  149. }
// watch registers a watcher over [key, end) starting at startRev and
// returns it together with a cancelFunc that unregisters it. Responses
// are delivered on ch. A startRev of 0, or one beyond the current
// revision, puts the watcher straight into the synced group; any other
// startRev makes it unsynced so syncWatchers can replay past events.
func (s *watchableStore) watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watcher{
		key: key,
		end: end,
		cur: startRev,
		id:  id,
		ch:  ch,
	}

	// lock ordering: s.mu before s.store.mu (consistent with the
	// other methods of this type).
	s.store.mu.Lock()
	synced := startRev > s.store.currentRev.main || startRev == 0
	if synced {
		// a synced watcher starts at the next revision so it only
		// observes events that happen after registration.
		wa.cur = s.store.currentRev.main + 1
		if startRev > wa.cur {
			wa.cur = startRev
		}
	}
	s.store.mu.Unlock()
	if synced {
		s.synced.add(wa)
	} else {
		slowWatcherGauge.Inc()
		s.unsynced.add(wa)
	}
	watcherGauge.Inc()

	cancel := cancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		// remove references of the watcher from whichever group holds
		// it; an unsynced watcher also counts against the slow gauge.
		if s.unsynced.delete(wa) {
			slowWatcherGauge.Dec()
			watcherGauge.Dec()
			return
		}

		if s.synced.delete(wa) {
			watcherGauge.Dec()
		}
		// If we cannot find it, it should have finished watch.
	})

	return wa, cancel
}
  192. // syncWatchersLoop syncs the watcher in the unsynced map every 100ms.
  193. func (s *watchableStore) syncWatchersLoop() {
  194. defer s.wg.Done()
  195. for {
  196. s.mu.Lock()
  197. s.syncWatchers()
  198. s.mu.Unlock()
  199. select {
  200. case <-time.After(100 * time.Millisecond):
  201. case <-s.stopc:
  202. return
  203. }
  204. }
  205. }
// syncWatchers catches unsynced watchers up with the store by:
//  1. scanning all unsynced watchers for the minimum revision they need
//     (clamped by the compact revision of the store),
//  2. reading every key-value pair in [minRev, curRev] from the backend,
//  3. sending the matching events to each watcher; fully caught-up
//     watchers are promoted to the synced group.
//
// The caller must hold s.mu (see syncWatchersLoop).
func (s *watchableStore) syncWatchers() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if s.unsynced.size() == 0 {
		return
	}

	// in order to find key-value pairs from unsynced watchers, we need to
	// find min revision index, and these revisions can be used to
	// query the backend store of key-value pairs
	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev
	minRev := s.unsynced.scanMinRev(curRev, compactionRev)
	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. And in boltdb, keys are revisions.
	// values are actual key-value pairs in backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	evs := kvsToEvents(&s.unsynced, revs, vs)
	tx.Unlock()

	wb := newWatcherBatch(&s.unsynced, evs)
	for w, eb := range wb {
		select {
		// s.store.Rev would re-take store.mu (already held here), so
		// read currentRev.main directly instead.
		case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.store.currentRev.main}:
			pendingEventsGauge.Add(float64(len(eb.evs)))
		default:
			// TODO: handle the full unsynced watchers.
			// continue to process other watchers for now, the full ones
			// will be processed next time and hopefully it will not be full.
			continue
		}
		if eb.moreRev != 0 {
			// this watcher still has revisions to catch up on; keep it
			// unsynced and resume from moreRev next round.
			w.cur = eb.moreRev
			continue
		}
		w.cur = curRev
		s.synced.add(w)
		s.unsynced.delete(w)
	}

	// bring all un-notified watchers to synced.
	for w := range s.unsynced.watchers {
		if !wb.contains(w) {
			w.cur = curRev
			s.synced.add(w)
			s.unsynced.delete(w)
		}
	}

	slowWatcherGauge.Set(float64(s.unsynced.size()))
}
  263. // kvsToEvents gets all events for the watchers from all key-value pairs
  264. func kvsToEvents(wg *watcherGroup, revs, vals [][]byte) (evs []storagepb.Event) {
  265. for i, v := range vals {
  266. var kv storagepb.KeyValue
  267. if err := kv.Unmarshal(v); err != nil {
  268. log.Panicf("storage: cannot unmarshal event: %v", err)
  269. }
  270. if !wg.contains(string(kv.Key)) {
  271. continue
  272. }
  273. ty := storagepb.PUT
  274. if isTombstone(revs[i]) {
  275. ty = storagepb.DELETE
  276. // patch in mod revision so watchers won't skip
  277. kv.ModRevision = bytesToRev(revs[i]).main
  278. }
  279. evs = append(evs, storagepb.Event{Kv: &kv, Type: ty})
  280. }
  281. return evs
  282. }
// notify sends out the events that happened at the given rev to the
// synced watchers watching the affected keys. A watcher whose channel is
// full is demoted to the unsynced group so the write path never blocks;
// syncWatchers will catch it up later.
func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
	for w, eb := range newWatcherBatch(&s.synced, evs) {
		if eb.revs != 1 {
			// every caller (Put, DeleteRange, TxnEnd) passes events of a
			// single revision, so a multi-revision batch is a logic error.
			panic("unexpected multiple revisions in notification")
		}
		select {
		case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.Rev()}:
			pendingEventsGauge.Add(float64(len(eb.evs)))
		default:
			// move slow watcher to unsynced
			w.cur = rev
			s.unsynced.add(w)
			s.synced.delete(w)
			slowWatcherGauge.Inc()
		}
	}
}
// rev returns the current revision of the underlying store.
func (s *watchableStore) rev() int64 { return s.store.Rev() }
// progress sends an empty WatchResponse carrying the current revision to
// w, but only if w is in the synced group and its channel has room. The
// send is non-blocking so a slow client cannot stall the caller.
func (s *watchableStore) progress(w *watcher) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if _, ok := s.synced.watchers[w]; ok {
		select {
		case w.ch <- WatchResponse{WatchID: w.id, Revision: s.rev()}:
		default:
			// If the ch is full, this watcher is receiving events.
			// We do not need to send progress at all.
		}
	}
}
// watcher holds the state of one registered watch.
type watcher struct {
	// the watcher key
	key []byte

	// end indicates the end of the range to watch.
	// If end is set, the watcher is on a range.
	end []byte

	// cur is the current watcher revision of a unsynced watcher.
	// cur will be updated for unsynced watcher while it is catching up.
	// cur is startRev of a synced watcher.
	// cur will not be updated for synced watcher.
	cur int64
	id  WatchID

	// a chan to send out the watch response.
	// The chan might be shared with other watchers.
	ch chan<- WatchResponse
}