watchable_store.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "log"
  17. "math"
  18. "strings"
  19. "sync"
  20. "time"
  21. "github.com/coreos/etcd/lease"
  22. "github.com/coreos/etcd/storage/backend"
  23. "github.com/coreos/etcd/storage/storagepb"
  24. )
const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)
var (
	// watchBatchMaxRevs is the maximum distinct revisions that
	// may be sent to an unsynced watcher at a time. Declared as
	// var instead of const for testing purposes.
	watchBatchMaxRevs = 1000
)
// eventBatch accumulates the events destined for a single watcher,
// capped at watchBatchMaxRevs distinct revisions per batch.
type eventBatch struct {
	// evs is a batch of revision-ordered events
	evs []storagepb.Event
	// revs is the minimum unique revisions observed for this batch
	revs int
	// moreRev is first revision with more events following this batch
	moreRev int64
}
type (
	// watcherSetByKey maps a watched key to the set of watchers on it.
	watcherSetByKey map[string]watcherSet
	// watcherSet is a set of watchers.
	watcherSet map[*watcher]struct{}
	// watcherBatch maps each watcher to its pending batch of events.
	watcherBatch map[*watcher]*eventBatch
)
  51. func (eb *eventBatch) add(ev storagepb.Event) {
  52. if eb.revs > watchBatchMaxRevs {
  53. // maxed out batch size
  54. return
  55. }
  56. if len(eb.evs) == 0 {
  57. // base case
  58. eb.revs = 1
  59. eb.evs = append(eb.evs, ev)
  60. return
  61. }
  62. // revision accounting
  63. ebRev := eb.evs[len(eb.evs)-1].Kv.ModRevision
  64. evRev := ev.Kv.ModRevision
  65. if evRev > ebRev {
  66. eb.revs++
  67. if eb.revs > watchBatchMaxRevs {
  68. eb.moreRev = evRev
  69. return
  70. }
  71. }
  72. eb.evs = append(eb.evs, ev)
  73. }
  74. func (wb watcherBatch) add(w *watcher, ev storagepb.Event) {
  75. eb := wb[w]
  76. if eb == nil {
  77. eb = &eventBatch{}
  78. wb[w] = eb
  79. }
  80. eb.add(ev)
  81. }
  82. func (w watcherSet) add(wa *watcher) {
  83. if _, ok := w[wa]; ok {
  84. panic("add watcher twice!")
  85. }
  86. w[wa] = struct{}{}
  87. }
  88. func (w watcherSetByKey) add(wa *watcher) {
  89. set := w[string(wa.key)]
  90. if set == nil {
  91. set = make(watcherSet)
  92. w[string(wa.key)] = set
  93. }
  94. set.add(wa)
  95. }
// getSetByKey returns the set of watchers registered on key and
// whether any such set exists.
func (w watcherSetByKey) getSetByKey(key string) (watcherSet, bool) {
	set, ok := w[key]
	return set, ok
}
  100. func (w watcherSetByKey) delete(wa *watcher) bool {
  101. k := string(wa.key)
  102. if v, ok := w[k]; ok {
  103. if _, ok := v[wa]; ok {
  104. delete(v, wa)
  105. // if there is nothing in the set,
  106. // remove the set
  107. if len(v) == 0 {
  108. delete(w, k)
  109. }
  110. return true
  111. }
  112. }
  113. return false
  114. }
// watchable is the interface a watch stream uses to register watchers
// against the backing store and to read its current revision.
type watchable interface {
	watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
	rev() int64
}
// watchableStore wraps a store with watcher bookkeeping so that
// mutations can be broadcast to interested watchers.
type watchableStore struct {
	// mu guards the unsynced and synced watcher maps.
	mu sync.Mutex

	*store

	// contains all unsynced watchers that needs to sync with events that have happened
	unsynced watcherSetByKey

	// contains all synced watchers that are in sync with the progress of the store.
	// The key of the map is the key that the watcher watches on.
	synced watcherSetByKey

	// stopc signals the background sync goroutine to exit.
	stopc chan struct{}
	// wg waits for the background sync goroutine on Close.
	wg sync.WaitGroup
}
// cancelFunc updates unsynced and synced maps when running
// cancel operations. Calling it removes the watcher from whichever
// map still holds it.
type cancelFunc func()
// newWatchableStore creates a watchableStore on top of backend b and
// starts the background goroutine that catches unsynced watchers up.
func newWatchableStore(b backend.Backend, le lease.Lessor) *watchableStore {
	s := &watchableStore{
		store:    NewStore(b, le),
		unsynced: make(watcherSetByKey),
		synced:   make(watcherSetByKey),
		stopc:    make(chan struct{}),
	}
	if s.le != nil {
		// use this store as the deleter so revokes trigger watch events
		s.le.SetRangeDeleter(s)
	}
	s.wg.Add(1)
	go s.syncWatchersLoop()
	return s
}
// Put writes the key-value pair under lease and notifies synced
// watchers of the resulting PUT event at the new revision.
func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rev = s.store.Put(key, value, lease)
	// a single put is expected to produce exactly one change record
	changes := s.store.getChanges()
	if len(changes) != 1 {
		log.Panicf("unexpected len(changes) != 1 after put")
	}

	ev := storagepb.Event{
		Type: storagepb.PUT,
		Kv:   &changes[0],
	}
	s.notify(rev, []storagepb.Event{ev})
	return rev
}
  163. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  164. s.mu.Lock()
  165. defer s.mu.Unlock()
  166. n, rev = s.store.DeleteRange(key, end)
  167. changes := s.store.getChanges()
  168. if len(changes) != int(n) {
  169. log.Panicf("unexpected len(changes) != n after deleteRange")
  170. }
  171. if n == 0 {
  172. return n, rev
  173. }
  174. evs := make([]storagepb.Event, n)
  175. for i, change := range changes {
  176. evs[i] = storagepb.Event{
  177. Type: storagepb.DELETE,
  178. Kv: &change}
  179. evs[i].Kv.ModRevision = rev
  180. }
  181. s.notify(rev, evs)
  182. return n, rev
  183. }
// TxnBegin starts a transaction on the underlying store. s.mu is
// deliberately left locked here and is released by the matching
// TxnEnd, so watcher maps cannot change mid-transaction.
func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	return s.store.TxnBegin()
}
// TxnEnd ends the transaction started by TxnBegin, notifies watchers
// of all changes it produced, and releases the s.mu lock acquired in
// TxnBegin.
func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		// NOTE(review): s.mu stays locked on this path — presumably a
		// mismatched txnID must not release the lock held on behalf of
		// the real in-flight transaction; confirm callers never expect
		// the lock to be freed after an error here.
		return err
	}

	changes := s.getChanges()
	if len(changes) == 0 {
		s.mu.Unlock()
		return nil
	}

	rev := s.store.Rev()
	evs := make([]storagepb.Event, len(changes))
	for i, change := range changes {
		// a nil value marks a delete; deletes need the commit revision
		// stamped in since the stored tombstone carries none
		switch change.Value {
		case nil:
			evs[i] = storagepb.Event{
				Type: storagepb.DELETE,
				Kv:   &changes[i]}
			evs[i].Kv.ModRevision = rev
		default:
			evs[i] = storagepb.Event{
				Type: storagepb.PUT,
				Kv:   &changes[i]}
		}
	}

	s.notify(rev, evs)
	s.mu.Unlock()
	return nil
}
// Close stops the background sync goroutine, waits for it to exit,
// and then closes the underlying store.
func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}
// NewWatchStream creates a watch stream backed by this store. All
// watchers created on the stream share one buffered response channel.
func (s *watchableStore) NewWatchStream() WatchStream {
	watchStreamGauge.Inc()
	return &watchStream{
		watchable: s,
		ch:        make(chan WatchResponse, chanBufLen),
		cancels:   make(map[WatchID]cancelFunc),
	}
}
// watch registers a watcher on key (or on the key prefix when prefix
// is true) starting from startRev; matching events are delivered on
// ch. It returns the watcher and a cancel function that unregisters
// it from whichever map (synced/unsynced) still holds it.
func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watcher{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	s.store.mu.Lock()
	// startRev == 0 means "watch from now"; a revision beyond the
	// current one is also synced since there is nothing to catch up on
	synced := startRev > s.store.currentRev.main || startRev == 0
	if synced {
		wa.cur = s.store.currentRev.main + 1
	}
	s.store.mu.Unlock()

	if synced {
		if startRev > wa.cur {
			panic("can't watch past sync revision")
		}
		s.synced.add(wa)
	} else {
		// unsynced watchers are picked up by syncWatchersLoop
		slowWatcherGauge.Inc()
		s.unsynced.add(wa)
	}
	watcherGauge.Inc()

	cancel := cancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		// remove references of the watcher
		if s.unsynced.delete(wa) {
			slowWatcherGauge.Dec()
			watcherGauge.Dec()
			return
		}

		if s.synced.delete(wa) {
			watcherGauge.Dec()
		}
		// If we cannot find it, it should have finished watch.
	})

	return wa, cancel
}
// syncWatchersLoop syncs the watcher in the unsynced map every 100ms.
// It runs until stopc is closed (see Close) and signals wg on exit.
func (s *watchableStore) syncWatchersLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchers()
		s.mu.Unlock()

		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}
// syncWatchers periodically syncs unsynced watchers by: Iterate all unsynced
// watchers to get the minimum revision within its range, skipping the
// watcher if its current revision is behind the compact revision of the
// store. And use this minimum revision to get all key-value pairs. Then send
// those events to watchers. Caller must hold s.mu.
func (s *watchableStore) syncWatchers() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if len(s.unsynced) == 0 {
		return
	}

	// in order to find key-value pairs from unsynced watchers, we need to
	// find min revision index, and these revisions can be used to
	// query the backend store of key-value pairs
	prefixes, minRev := s.scanUnsync()
	curRev := s.store.currentRev.main

	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. And in boltdb, keys are revisions.
	// values are actual key-value pairs in backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	evs := kvsToEvents(revs, vs, s.unsynced, prefixes)
	tx.Unlock()

	for w, eb := range newWatcherBatch(s.unsynced, evs) {
		select {
		// s.store.Rev also uses Lock, so just return directly
		case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.store.currentRev.main}:
			pendingEventsGauge.Add(float64(len(eb.evs)))
		default:
			// TODO: handle the full unsynced watchers.
			// continue to process other watchers for now, the full ones
			// will be processed next time and hopefully it will not be full.
			continue
		}
		if eb.moreRev != 0 {
			// the batch was capped; resume this watcher from the first
			// revision left out on the next sync pass
			w.cur = eb.moreRev
			continue
		}
		// fully caught up: promote the watcher to the synced set
		w.cur = curRev
		s.synced.add(w)
		s.unsynced.delete(w)
	}

	slowWatcherGauge.Set(float64(len(s.unsynced)))
}
// scanUnsync walks all unsynced watchers and returns the set of
// watched prefixes plus the minimum revision needed to catch every
// watcher up. Watchers whose revision predates the last compaction are
// sent a compaction response and dropped from the unsynced set.
func (s *watchableStore) scanUnsync() (prefixes map[string]struct{}, minRev int64) {
	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	prefixes = make(map[string]struct{})
	minRev = int64(math.MaxInt64)
	for _, set := range s.unsynced {
		for w := range set {
			k := string(w.key)

			if w.cur > curRev {
				panic("watcher current revision should not exceed current revision")
			}

			if w.cur < compactionRev {
				// this watcher can never catch up: the events it needs
				// were compacted away; report the compact revision
				select {
				case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactionRev}:
					s.unsynced.delete(w)
				default:
					// retry next time
				}
				continue
			}

			if minRev > w.cur {
				minRev = w.cur
			}

			if w.prefix {
				prefixes[k] = struct{}{}
			}
		}
	}

	return prefixes, minRev
}
  363. // kvsToEvents gets all events for the watchers from all key-value pairs
  364. func kvsToEvents(revs, vals [][]byte, wsk watcherSetByKey, pfxs map[string]struct{}) (evs []storagepb.Event) {
  365. for i, v := range vals {
  366. var kv storagepb.KeyValue
  367. if err := kv.Unmarshal(v); err != nil {
  368. log.Panicf("storage: cannot unmarshal event: %v", err)
  369. }
  370. k := string(kv.Key)
  371. if _, ok := wsk.getSetByKey(k); !ok && !matchPrefix(k, pfxs) {
  372. continue
  373. }
  374. ty := storagepb.PUT
  375. if isTombstone(revs[i]) {
  376. ty = storagepb.DELETE
  377. // patch in mod revision so watchers won't skip
  378. kv.ModRevision = bytesToRev(revs[i]).main
  379. }
  380. evs = append(evs, storagepb.Event{Kv: &kv, Type: ty})
  381. }
  382. return evs
  383. }
// notify notifies the fact that given event at the given rev just happened to
// watchers that watch on the key of the event. Watchers whose channel
// is full are demoted to the unsynced set to be caught up later.
func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
	we := newWatcherBatch(s.synced, evs)
	for _, wm := range s.synced {
		for w := range wm {
			eb, ok := we[w]
			if !ok {
				continue
			}
			if eb.revs != 1 {
				// every event passed to notify shares one revision, so
				// a synced watcher's batch can never span more than one
				panic("unexpected multiple revisions in notification")
			}
			select {
			case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: s.Rev()}:
				pendingEventsGauge.Add(float64(len(eb.evs)))
			default:
				// move slow watcher to unsynced
				w.cur = rev
				s.unsynced.add(w)
				delete(wm, w)
				// NOTE(review): unlike watcherSetByKey.delete, an
				// emptied set stays keyed in s.synced here — confirm
				// this map growth is acceptable
				slowWatcherGauge.Inc()
			}
		}
	}
}
// rev returns the current revision of the underlying store.
func (s *watchableStore) rev() int64 { return s.store.Rev() }
// watcher is a single registration delivering matching events on ch.
type watcher struct {
	// the watcher key
	key []byte
	// prefix indicates if watcher is on a key or a prefix.
	// If prefix is true, the watcher is on a prefix.
	prefix bool
	// cur is the current watcher revision.
	// If cur is behind the current revision of the KV,
	// watcher is unsynced and needs to catch up.
	cur int64
	// id identifies this watcher within its watch stream.
	id WatchID
	// a chan to send out the watch response.
	// The chan might be shared with other watchers.
	ch chan<- WatchResponse
}
// newWatcherBatch maps watchers to their matched events. It enables quick
// events look up by watcher. sm is consulted for every prefix of each
// event key so both exact-key and prefix watchers are collected.
func newWatcherBatch(sm watcherSetByKey, evs []storagepb.Event) watcherBatch {
	wb := make(watcherBatch)
	for _, ev := range evs {
		key := string(ev.Kv.Key)

		// check all prefixes of the key to notify all corresponded watchers
		for i := 0; i <= len(key); i++ {
			for w := range sm[key[:i]] {
				// don't double notify
				if ev.Kv.ModRevision < w.cur {
					continue
				}

				// the watcher needs to be notified when either it watches prefix or
				// the key is exactly matched.
				if !w.prefix && i != len(ev.Kv.Key) {
					continue
				}
				wb.add(w, ev)
			}
		}
	}
	return wb
}
  450. // matchPrefix returns true if key has any matching prefix
  451. // from prefixes map.
  452. func matchPrefix(key string, prefixes map[string]struct{}) bool {
  453. for p := range prefixes {
  454. if strings.HasPrefix(key, p) {
  455. return true
  456. }
  457. }
  458. return false
  459. }