watchable_store.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "log"
  17. "math"
  18. "strings"
  19. "sync"
  20. "time"
  21. "github.com/coreos/etcd/lease"
  22. "github.com/coreos/etcd/storage/backend"
  23. "github.com/coreos/etcd/storage/storagepb"
  24. )
  25. const (
  26. // chanBufLen is the length of the buffered chan
  27. // for sending out watched events.
  28. // TODO: find a good buf value. 1024 is just a random one that
  29. // seems to be reasonable.
  30. chanBufLen = 1024
  31. )
  32. type (
  33. watcherSetByKey map[string]watcherSet
  34. watcherSet map[*watcher]struct{}
  35. )
  36. func (w watcherSet) add(wa *watcher) {
  37. if _, ok := w[wa]; ok {
  38. panic("add watcher twice!")
  39. }
  40. w[wa] = struct{}{}
  41. }
  42. func (w watcherSetByKey) add(wa *watcher) {
  43. set := w[string(wa.key)]
  44. if set == nil {
  45. set = make(watcherSet)
  46. w[string(wa.key)] = set
  47. }
  48. set.add(wa)
  49. }
  50. func (w watcherSetByKey) delete(wa *watcher) bool {
  51. k := string(wa.key)
  52. if v, ok := w[k]; ok {
  53. if _, ok := v[wa]; ok {
  54. delete(v, wa)
  55. // if there is nothing in the set,
  56. // remove the set
  57. if len(v) == 0 {
  58. delete(w, k)
  59. }
  60. return true
  61. }
  62. }
  63. return false
  64. }
  65. type watchable interface {
  66. watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
  67. rev() int64
  68. }
  69. type watchableStore struct {
  70. mu sync.Mutex
  71. *store
  72. // contains all unsynced watchers that needs to sync with events that have happened
  73. unsynced watcherSet
  74. // contains all synced watchers that are in sync with the progress of the store.
  75. // The key of the map is the key that the watcher watches on.
  76. synced watcherSetByKey
  77. stopc chan struct{}
  78. wg sync.WaitGroup
  79. }
  80. // cancelFunc updates unsynced and synced maps when running
  81. // cancel operations.
  82. type cancelFunc func()
  83. func newWatchableStore(b backend.Backend, le lease.Lessor) *watchableStore {
  84. s := &watchableStore{
  85. store: NewStore(b, le),
  86. unsynced: make(watcherSet),
  87. synced: make(watcherSetByKey),
  88. stopc: make(chan struct{}),
  89. }
  90. if s.le != nil {
  91. // use this store as the deleter so revokes trigger watch events
  92. s.le.SetRangeDeleter(s)
  93. }
  94. s.wg.Add(1)
  95. go s.syncWatchersLoop()
  96. return s
  97. }
  98. func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
  99. s.mu.Lock()
  100. defer s.mu.Unlock()
  101. rev = s.store.Put(key, value, lease)
  102. changes := s.store.getChanges()
  103. if len(changes) != 1 {
  104. log.Panicf("unexpected len(changes) != 1 after put")
  105. }
  106. ev := storagepb.Event{
  107. Type: storagepb.PUT,
  108. Kv: &changes[0],
  109. }
  110. s.handle(rev, []storagepb.Event{ev})
  111. return rev
  112. }
  113. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  114. s.mu.Lock()
  115. defer s.mu.Unlock()
  116. n, rev = s.store.DeleteRange(key, end)
  117. changes := s.store.getChanges()
  118. if len(changes) != int(n) {
  119. log.Panicf("unexpected len(changes) != n after deleteRange")
  120. }
  121. if n == 0 {
  122. return n, rev
  123. }
  124. evs := make([]storagepb.Event, n)
  125. for i, change := range changes {
  126. evs[i] = storagepb.Event{
  127. Type: storagepb.DELETE,
  128. Kv: &change}
  129. }
  130. s.handle(rev, evs)
  131. return n, rev
  132. }
  133. func (s *watchableStore) TxnBegin() int64 {
  134. s.mu.Lock()
  135. return s.store.TxnBegin()
  136. }
  137. func (s *watchableStore) TxnEnd(txnID int64) error {
  138. err := s.store.TxnEnd(txnID)
  139. if err != nil {
  140. return err
  141. }
  142. changes := s.getChanges()
  143. if len(changes) == 0 {
  144. s.mu.Unlock()
  145. return nil
  146. }
  147. evs := make([]storagepb.Event, len(changes))
  148. for i, change := range changes {
  149. switch change.Value {
  150. case nil:
  151. evs[i] = storagepb.Event{
  152. Type: storagepb.DELETE,
  153. Kv: &changes[i]}
  154. default:
  155. evs[i] = storagepb.Event{
  156. Type: storagepb.PUT,
  157. Kv: &changes[i]}
  158. }
  159. }
  160. s.handle(s.store.Rev(), evs)
  161. s.mu.Unlock()
  162. return nil
  163. }
  164. func (s *watchableStore) Close() error {
  165. close(s.stopc)
  166. s.wg.Wait()
  167. return s.store.Close()
  168. }
  169. func (s *watchableStore) NewWatchStream() WatchStream {
  170. watchStreamGauge.Inc()
  171. return &watchStream{
  172. watchable: s,
  173. ch: make(chan WatchResponse, chanBufLen),
  174. cancels: make(map[WatchID]cancelFunc),
  175. }
  176. }
  177. func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
  178. s.mu.Lock()
  179. defer s.mu.Unlock()
  180. wa := &watcher{
  181. key: key,
  182. prefix: prefix,
  183. cur: startRev,
  184. id: id,
  185. ch: ch,
  186. }
  187. if startRev == 0 {
  188. s.synced.add(wa)
  189. } else {
  190. slowWatcherGauge.Inc()
  191. s.unsynced[wa] = struct{}{}
  192. }
  193. watcherGauge.Inc()
  194. cancel := cancelFunc(func() {
  195. s.mu.Lock()
  196. defer s.mu.Unlock()
  197. // remove global references of the watcher
  198. if _, ok := s.unsynced[wa]; ok {
  199. delete(s.unsynced, wa)
  200. slowWatcherGauge.Dec()
  201. watcherGauge.Dec()
  202. return
  203. }
  204. if s.synced.delete(wa) {
  205. watcherGauge.Dec()
  206. }
  207. // If we cannot find it, it should have finished watch.
  208. })
  209. return wa, cancel
  210. }
  211. // syncWatchersLoop syncs the watcher in the unsynced map every 100ms.
  212. func (s *watchableStore) syncWatchersLoop() {
  213. defer s.wg.Done()
  214. for {
  215. s.mu.Lock()
  216. s.syncWatchers()
  217. s.mu.Unlock()
  218. select {
  219. case <-time.After(100 * time.Millisecond):
  220. case <-s.stopc:
  221. return
  222. }
  223. }
  224. }
  225. // syncWatchers periodically syncs unsynced watchers by: Iterate all unsynced
  226. // watchers to get the minimum revision within its range, skipping the
  227. // watcher if its current revision is behind the compact revision of the
  228. // store. And use this minimum revision to get all key-value pairs. Then send
  229. // those events to watchers.
  230. func (s *watchableStore) syncWatchers() {
  231. s.store.mu.Lock()
  232. defer s.store.mu.Unlock()
  233. if len(s.unsynced) == 0 {
  234. return
  235. }
  236. // in order to find key-value pairs from unsynced watchers, we need to
  237. // find min revision index, and these revisions can be used to
  238. // query the backend store of key-value pairs
  239. minRev := int64(math.MaxInt64)
  240. curRev := s.store.currentRev.main
  241. compactionRev := s.store.compactMainRev
  242. // TODO: change unsynced struct type same to this
  243. keyToUnsynced := make(watcherSetByKey)
  244. prefixes := make(map[string]struct{})
  245. for w := range s.unsynced {
  246. k := string(w.key)
  247. if w.cur > curRev {
  248. panic("watcher current revision should not exceed current revision")
  249. }
  250. if w.cur < compactionRev {
  251. // TODO: return error compacted to that watcher instead of
  252. // just removing it silently from unsynced.
  253. delete(s.unsynced, w)
  254. continue
  255. }
  256. if minRev >= w.cur {
  257. minRev = w.cur
  258. }
  259. keyToUnsynced.add(w)
  260. if w.prefix {
  261. prefixes[k] = struct{}{}
  262. }
  263. }
  264. minBytes, maxBytes := newRevBytes(), newRevBytes()
  265. revToBytes(revision{main: minRev}, minBytes)
  266. revToBytes(revision{main: curRev + 1}, maxBytes)
  267. // UnsafeRange returns keys and values. And in boltdb, keys are revisions.
  268. // values are actual key-value pairs in backend.
  269. tx := s.store.b.BatchTx()
  270. tx.Lock()
  271. ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
  272. tx.Unlock()
  273. evs := []storagepb.Event{}
  274. // get the list of all events from all key-value pairs
  275. for i, v := range vs {
  276. var kv storagepb.KeyValue
  277. if err := kv.Unmarshal(v); err != nil {
  278. log.Panicf("storage: cannot unmarshal event: %v", err)
  279. }
  280. k := string(kv.Key)
  281. if _, ok := keyToUnsynced[k]; !ok && !matchPrefix(k, prefixes) {
  282. continue
  283. }
  284. var ev storagepb.Event
  285. switch {
  286. case isTombstone(ks[i]):
  287. ev.Type = storagepb.DELETE
  288. default:
  289. ev.Type = storagepb.PUT
  290. }
  291. ev.Kv = &kv
  292. evs = append(evs, ev)
  293. }
  294. for w, es := range newWatcherToEventMap(keyToUnsynced, evs) {
  295. select {
  296. // s.store.Rev also uses Lock, so just return directly
  297. case w.ch <- WatchResponse{WatchID: w.id, Events: es, Revision: s.store.currentRev.main}:
  298. pendingEventsGauge.Add(float64(len(es)))
  299. default:
  300. // TODO: handle the full unsynced watchers.
  301. // continue to process other watchers for now, the full ones
  302. // will be processed next time and hopefully it will not be full.
  303. continue
  304. }
  305. s.synced.add(w)
  306. delete(s.unsynced, w)
  307. }
  308. slowWatcherGauge.Set(float64(len(s.unsynced)))
  309. }
  310. // handle handles the change of the happening event on all watchers.
  311. func (s *watchableStore) handle(rev int64, evs []storagepb.Event) {
  312. s.notify(rev, evs)
  313. }
  314. // notify notifies the fact that given event at the given rev just happened to
  315. // watchers that watch on the key of the event.
  316. func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
  317. we := newWatcherToEventMap(s.synced, evs)
  318. for _, wm := range s.synced {
  319. for w := range wm {
  320. es, ok := we[w]
  321. if !ok {
  322. continue
  323. }
  324. select {
  325. case w.ch <- WatchResponse{WatchID: w.id, Events: es, Revision: s.Rev()}:
  326. pendingEventsGauge.Add(float64(len(es)))
  327. default:
  328. // move slow watcher to unsynced
  329. w.cur = rev
  330. s.unsynced[w] = struct{}{}
  331. delete(wm, w)
  332. slowWatcherGauge.Inc()
  333. }
  334. }
  335. }
  336. }
  337. func (s *watchableStore) rev() int64 { return s.store.Rev() }
  338. type watcher struct {
  339. // the watcher key
  340. key []byte
  341. // prefix indicates if watcher is on a key or a prefix.
  342. // If prefix is true, the watcher is on a prefix.
  343. prefix bool
  344. // cur is the current watcher revision.
  345. // If cur is behind the current revision of the KV,
  346. // watcher is unsynced and needs to catch up.
  347. cur int64
  348. id WatchID
  349. // a chan to send out the watch response.
  350. // The chan might be shared with other watchers.
  351. ch chan<- WatchResponse
  352. }
  353. // newWatcherToEventMap creates a map that has watcher as key and events as
  354. // value. It enables quick events look up by watcher.
  355. func newWatcherToEventMap(sm watcherSetByKey, evs []storagepb.Event) map[*watcher][]storagepb.Event {
  356. watcherToEvents := make(map[*watcher][]storagepb.Event)
  357. for _, ev := range evs {
  358. key := string(ev.Kv.Key)
  359. // check all prefixes of the key to notify all corresponded watchers
  360. for i := 0; i <= len(key); i++ {
  361. k := string(key[:i])
  362. wm, ok := sm[k]
  363. if !ok {
  364. continue
  365. }
  366. for w := range wm {
  367. // the watcher needs to be notified when either it watches prefix or
  368. // the key is exactly matched.
  369. if !w.prefix && i != len(ev.Kv.Key) {
  370. continue
  371. }
  372. if _, ok := watcherToEvents[w]; !ok {
  373. watcherToEvents[w] = []storagepb.Event{}
  374. }
  375. watcherToEvents[w] = append(watcherToEvents[w], ev)
  376. }
  377. }
  378. }
  379. return watcherToEvents
  380. }
  381. // matchPrefix returns true if key has any matching prefix
  382. // from prefixes map.
  383. func matchPrefix(key string, prefixes map[string]struct{}) bool {
  384. for p := range prefixes {
  385. if strings.HasPrefix(key, p) {
  386. return true
  387. }
  388. }
  389. return false
  390. }