watchable_store.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "log"
  17. "math"
  18. "strings"
  19. "sync"
  20. "time"
  21. "github.com/coreos/etcd/lease"
  22. "github.com/coreos/etcd/storage/backend"
  23. "github.com/coreos/etcd/storage/storagepb"
  24. )
const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)
type (
	// watcherSetByKey maps a watched key to the set of watchers on that key.
	watcherSetByKey map[string]watcherSet
	// watcherSet is a set of watchers, keyed by pointer identity.
	watcherSet map[*watcher]struct{}
)
  36. func (w watcherSet) add(wa *watcher) {
  37. if _, ok := w[wa]; ok {
  38. panic("add watcher twice!")
  39. }
  40. w[wa] = struct{}{}
  41. }
  42. func (w watcherSetByKey) add(wa *watcher) {
  43. set := w[string(wa.key)]
  44. if set == nil {
  45. set = make(watcherSet)
  46. w[string(wa.key)] = set
  47. }
  48. set.add(wa)
  49. }
  50. func (w watcherSetByKey) getSetByKey(key string) (watcherSet, bool) {
  51. set, ok := w[key]
  52. return set, ok
  53. }
  54. func (w watcherSetByKey) delete(wa *watcher) bool {
  55. k := string(wa.key)
  56. if v, ok := w[k]; ok {
  57. if _, ok := v[wa]; ok {
  58. delete(v, wa)
  59. // if there is nothing in the set,
  60. // remove the set
  61. if len(v) == 0 {
  62. delete(w, k)
  63. }
  64. return true
  65. }
  66. }
  67. return false
  68. }
// watchable is the interface a store implements to support watchers.
type watchable interface {
	// watch registers a watcher on key (or on the prefix, if prefix is
	// true) starting at startRev; responses are sent on ch, and the
	// returned cancelFunc unregisters the watcher.
	watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
	// rev returns the current revision of the store.
	rev() int64
}
// watchableStore wraps a store with watcher bookkeeping. Watchers live
// in exactly one of two maps: synced (caught up with the store's
// progress) or unsynced (still catching up), and move between them as
// events flow and channels fill up.
type watchableStore struct {
	// mu protects unsynced/synced and serializes mutating operations.
	mu sync.Mutex

	*store

	// contains all unsynced watchers that needs to sync with events that have happened
	unsynced watcherSetByKey

	// contains all synced watchers that are in sync with the progress of the store.
	// The key of the map is the key that the watcher watches on.
	synced watcherSetByKey

	// stopc is closed by Close to stop syncWatchersLoop.
	stopc chan struct{}
	// wg waits for syncWatchersLoop to exit during Close.
	wg sync.WaitGroup
}
// cancelFunc updates unsynced and synced maps when running
// cancel operations. It is returned by watch so the caller can
// unregister the watcher it created.
type cancelFunc func()
  87. func newWatchableStore(b backend.Backend, le lease.Lessor) *watchableStore {
  88. s := &watchableStore{
  89. store: NewStore(b, le),
  90. unsynced: make(watcherSetByKey),
  91. synced: make(watcherSetByKey),
  92. stopc: make(chan struct{}),
  93. }
  94. if s.le != nil {
  95. // use this store as the deleter so revokes trigger watch events
  96. s.le.SetRangeDeleter(s)
  97. }
  98. s.wg.Add(1)
  99. go s.syncWatchersLoop()
  100. return s
  101. }
  102. func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
  103. s.mu.Lock()
  104. defer s.mu.Unlock()
  105. rev = s.store.Put(key, value, lease)
  106. changes := s.store.getChanges()
  107. if len(changes) != 1 {
  108. log.Panicf("unexpected len(changes) != 1 after put")
  109. }
  110. ev := storagepb.Event{
  111. Type: storagepb.PUT,
  112. Kv: &changes[0],
  113. }
  114. s.notify(rev, []storagepb.Event{ev})
  115. return rev
  116. }
  117. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  118. s.mu.Lock()
  119. defer s.mu.Unlock()
  120. n, rev = s.store.DeleteRange(key, end)
  121. changes := s.store.getChanges()
  122. if len(changes) != int(n) {
  123. log.Panicf("unexpected len(changes) != n after deleteRange")
  124. }
  125. if n == 0 {
  126. return n, rev
  127. }
  128. evs := make([]storagepb.Event, n)
  129. for i, change := range changes {
  130. evs[i] = storagepb.Event{
  131. Type: storagepb.DELETE,
  132. Kv: &change}
  133. evs[i].Kv.ModRevision = rev
  134. }
  135. s.notify(rev, evs)
  136. return n, rev
  137. }
// TxnBegin starts a transaction and returns its id. It acquires s.mu
// and intentionally does NOT release it here: the lock is held for the
// whole transaction and released by the matching TxnEnd, which notifies
// watchers of all txn changes before unlocking.
func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	return s.store.TxnBegin()
}
// TxnEnd ends the transaction with txnID, sends watch events for every
// change made inside it, and releases s.mu acquired in TxnBegin.
func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		// NOTE(review): s.mu deliberately stays locked on this path —
		// presumably the error means the txn did not actually end (e.g.
		// an id mismatch) and is still open; confirm against
		// store.TxnEnd's error semantics before "fixing" this.
		return err
	}

	changes := s.getChanges()
	if len(changes) == 0 {
		s.mu.Unlock()
		return nil
	}

	rev := s.store.Rev()
	evs := make([]storagepb.Event, len(changes))
	for i, change := range changes {
		// a nil Value marks a deletion; anything else is a put
		switch change.Value {
		case nil:
			evs[i] = storagepb.Event{
				Type: storagepb.DELETE,
				Kv:   &changes[i]}
			// deletes carry the txn's commit revision
			evs[i].Kv.ModRevision = rev
		default:
			evs[i] = storagepb.Event{
				Type: storagepb.PUT,
				Kv:   &changes[i]}
		}
	}

	s.notify(rev, evs)
	s.mu.Unlock()
	return nil
}
// Close stops the background sync loop, waits for it to exit, and then
// closes the underlying store.
func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}
// NewWatchStream creates a WatchStream on this store. All watchers
// created on the stream share a single buffered response channel.
func (s *watchableStore) NewWatchStream() WatchStream {
	watchStreamGauge.Inc()
	return &watchStream{
		watchable: s,
		ch:        make(chan WatchResponse, chanBufLen),
		cancels:   make(map[WatchID]cancelFunc),
	}
}
// watch registers a watcher on key (or on the prefix if prefix is true)
// starting from startRev, sending responses on ch. The returned
// cancelFunc removes the watcher from whichever map it lives in.
// Lock order: s.mu is taken before s.store.mu.
func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watcher{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	s.store.mu.Lock()
	// startRev == 0 means "watch from now"; a startRev beyond the
	// current revision needs no catch-up either — both count as synced.
	synced := startRev > s.store.currentRev.main || startRev == 0
	if synced {
		wa.cur = s.store.currentRev.main + 1
	}
	s.store.mu.Unlock()
	if synced {
		// sanity check: a synced watcher must not start past its cursor
		if startRev > wa.cur {
			panic("can't watch past sync revision")
		}
		s.synced.add(wa)
	} else {
		slowWatcherGauge.Inc()
		s.unsynced.add(wa)
	}
	watcherGauge.Inc()

	cancel := cancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		// remove references of the watcher
		if s.unsynced.delete(wa) {
			slowWatcherGauge.Dec()
			watcherGauge.Dec()
			return
		}
		if s.synced.delete(wa) {
			watcherGauge.Dec()
		}
		// If we cannot find it, it should have finished watch.
	})

	return wa, cancel
}
// syncWatchersLoop syncs the watcher in the unsynced map every 100ms.
// It runs until stopc is closed and signals wg on exit.
func (s *watchableStore) syncWatchersLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchers()
		s.mu.Unlock()

		// time.After (rather than a ticker) makes the 100ms delay start
		// after each sync completes, so a slow sync does not pile up ticks.
		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}
  240. // syncWatchers periodically syncs unsynced watchers by: Iterate all unsynced
  241. // watchers to get the minimum revision within its range, skipping the
  242. // watcher if its current revision is behind the compact revision of the
  243. // store. And use this minimum revision to get all key-value pairs. Then send
  244. // those events to watchers.
  245. func (s *watchableStore) syncWatchers() {
  246. s.store.mu.Lock()
  247. defer s.store.mu.Unlock()
  248. if len(s.unsynced) == 0 {
  249. return
  250. }
  251. // in order to find key-value pairs from unsynced watchers, we need to
  252. // find min revision index, and these revisions can be used to
  253. // query the backend store of key-value pairs
  254. prefixes, minRev := s.scanUnsync()
  255. curRev := s.store.currentRev.main
  256. minBytes, maxBytes := newRevBytes(), newRevBytes()
  257. revToBytes(revision{main: minRev}, minBytes)
  258. revToBytes(revision{main: curRev + 1}, maxBytes)
  259. // UnsafeRange returns keys and values. And in boltdb, keys are revisions.
  260. // values are actual key-value pairs in backend.
  261. tx := s.store.b.BatchTx()
  262. tx.Lock()
  263. revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
  264. evs := kvsToEvents(revs, vs, s.unsynced, prefixes)
  265. tx.Unlock()
  266. for w, es := range newWatcherToEventMap(s.unsynced, evs) {
  267. select {
  268. // s.store.Rev also uses Lock, so just return directly
  269. case w.ch <- WatchResponse{WatchID: w.id, Events: es, Revision: s.store.currentRev.main}:
  270. pendingEventsGauge.Add(float64(len(es)))
  271. default:
  272. // TODO: handle the full unsynced watchers.
  273. // continue to process other watchers for now, the full ones
  274. // will be processed next time and hopefully it will not be full.
  275. continue
  276. }
  277. w.cur = curRev
  278. s.synced.add(w)
  279. s.unsynced.delete(w)
  280. }
  281. slowWatcherGauge.Set(float64(len(s.unsynced)))
  282. }
// scanUnsync walks every unsynced watcher and returns the set of watched
// prefixes plus the minimum revision any unsynced watcher still needs.
// Watchers whose cursor fell behind the compaction revision are told the
// compact revision and dropped from the unsynced map.
// Caller must hold s.store.mu (it reads currentRev/compactMainRev).
func (s *watchableStore) scanUnsync() (prefixes map[string]struct{}, minRev int64) {
	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	prefixes = make(map[string]struct{})
	minRev = int64(math.MaxInt64)
	for _, set := range s.unsynced {
		for w := range set {
			k := string(w.key)

			if w.cur > curRev {
				panic("watcher current revision should not exceed current revision")
			}

			if w.cur < compactionRev {
				// the revision the watcher needs was compacted away;
				// notify it of the compact revision and remove it.
				// (deleting the entry being iterated is safe in Go.)
				select {
				case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactionRev}:
					s.unsynced.delete(w)
				default:
					// retry next time
				}
				continue
			}

			if minRev > w.cur {
				minRev = w.cur
			}

			if w.prefix {
				prefixes[k] = struct{}{}
			}
		}
	}

	return prefixes, minRev
}
  313. // kvsToEvents gets all events for the watchers from all key-value pairs
  314. func kvsToEvents(revs, vals [][]byte, wsk watcherSetByKey, pfxs map[string]struct{}) (evs []storagepb.Event) {
  315. for i, v := range vals {
  316. var kv storagepb.KeyValue
  317. if err := kv.Unmarshal(v); err != nil {
  318. log.Panicf("storage: cannot unmarshal event: %v", err)
  319. }
  320. k := string(kv.Key)
  321. if _, ok := wsk.getSetByKey(k); !ok && !matchPrefix(k, pfxs) {
  322. continue
  323. }
  324. ty := storagepb.PUT
  325. if isTombstone(revs[i]) {
  326. ty = storagepb.DELETE
  327. // patch in mod revision so watchers won't skip
  328. kv.ModRevision = bytesToRev(revs[i]).main
  329. }
  330. evs = append(evs, storagepb.Event{Kv: &kv, Type: ty})
  331. }
  332. return evs
  333. }
  334. // notify notifies the fact that given event at the given rev just happened to
  335. // watchers that watch on the key of the event.
  336. func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
  337. we := newWatcherToEventMap(s.synced, evs)
  338. for _, wm := range s.synced {
  339. for w := range wm {
  340. es, ok := we[w]
  341. if !ok {
  342. continue
  343. }
  344. select {
  345. case w.ch <- WatchResponse{WatchID: w.id, Events: es, Revision: s.Rev()}:
  346. pendingEventsGauge.Add(float64(len(es)))
  347. default:
  348. // move slow watcher to unsynced
  349. w.cur = rev
  350. s.unsynced.add(w)
  351. delete(wm, w)
  352. slowWatcherGauge.Inc()
  353. }
  354. }
  355. }
  356. }
// rev returns the current revision of the underlying store.
func (s *watchableStore) rev() int64 { return s.store.Rev() }
// watcher is a single registered watch on a key or key prefix.
type watcher struct {
	// the watcher key
	key []byte

	// prefix indicates if watcher is on a key or a prefix.
	// If prefix is true, the watcher is on a prefix.
	prefix bool

	// cur is the current watcher revision.
	// If cur is behind the current revision of the KV,
	// watcher is unsynced and needs to catch up.
	cur int64

	// id identifies this watcher within its WatchStream.
	id WatchID

	// a chan to send out the watch response.
	// The chan might be shared with other watchers.
	ch chan<- WatchResponse
}
  373. // newWatcherToEventMap creates a map that has watcher as key and events as
  374. // value. It enables quick events look up by watcher.
  375. func newWatcherToEventMap(sm watcherSetByKey, evs []storagepb.Event) map[*watcher][]storagepb.Event {
  376. watcherToEvents := make(map[*watcher][]storagepb.Event)
  377. for _, ev := range evs {
  378. key := string(ev.Kv.Key)
  379. // check all prefixes of the key to notify all corresponded watchers
  380. for i := 0; i <= len(key); i++ {
  381. for w := range sm[key[:i]] {
  382. // don't double notify
  383. if ev.Kv.ModRevision < w.cur {
  384. continue
  385. }
  386. // the watcher needs to be notified when either it watches prefix or
  387. // the key is exactly matched.
  388. if !w.prefix && i != len(ev.Kv.Key) {
  389. continue
  390. }
  391. watcherToEvents[w] = append(watcherToEvents[w], ev)
  392. }
  393. }
  394. }
  395. return watcherToEvents
  396. }
  397. // matchPrefix returns true if key has any matching prefix
  398. // from prefixes map.
  399. func matchPrefix(key string, prefixes map[string]struct{}) bool {
  400. for p := range prefixes {
  401. if strings.HasPrefix(key, p) {
  402. return true
  403. }
  404. }
  405. return false
  406. }