watchable_store.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "fmt"
  17. "log"
  18. "math"
  19. "sync"
  20. "time"
  21. "github.com/coreos/etcd/storage/storagepb"
  22. )
  23. const (
  24. // chanBufLen is the length of the buffered chan
  25. // for sending out watched events.
  26. // TODO: find a good buf value. 1024 is just a random one that
  27. // seems to be reasonable.
  28. chanBufLen = 1024
  29. )
  30. type watchable interface {
  31. watch(key []byte, prefix bool, startRev, id int64, ch chan<- WatchResponse) (*watcher, CancelFunc)
  32. }
  33. type watchableStore struct {
  34. mu sync.Mutex
  35. *store
  36. // contains all unsynced watchers that needs to sync with events that have happened
  37. unsynced map[*watcher]struct{}
  38. // contains all synced watchers that are in sync with the progress of the store.
  39. // The key of the map is the key that the watcher watches on.
  40. synced map[string]map[*watcher]struct{}
  41. tx *ongoingTx
  42. stopc chan struct{}
  43. wg sync.WaitGroup
  44. }
  45. func newWatchableStore(path string) *watchableStore {
  46. s := &watchableStore{
  47. store: newDefaultStore(path),
  48. unsynced: make(map[*watcher]struct{}),
  49. synced: make(map[string]map[*watcher]struct{}),
  50. stopc: make(chan struct{}),
  51. }
  52. s.wg.Add(1)
  53. go s.syncWatchersLoop()
  54. return s
  55. }
  56. func (s *watchableStore) Put(key, value []byte) (rev int64) {
  57. s.mu.Lock()
  58. defer s.mu.Unlock()
  59. rev = s.store.Put(key, value)
  60. // TODO: avoid this range
  61. kvs, _, err := s.store.Range(key, nil, 0, rev)
  62. if err != nil {
  63. log.Panicf("unexpected range error (%v)", err)
  64. }
  65. ev := storagepb.Event{
  66. Type: storagepb.PUT,
  67. Kv: &kvs[0],
  68. }
  69. s.handle(rev, []storagepb.Event{ev})
  70. return rev
  71. }
  72. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  73. s.mu.Lock()
  74. defer s.mu.Unlock()
  75. // TODO: avoid this range
  76. kvs, _, err := s.store.Range(key, end, 0, 0)
  77. if err != nil {
  78. log.Panicf("unexpected range error (%v)", err)
  79. }
  80. n, rev = s.store.DeleteRange(key, end)
  81. evs := make([]storagepb.Event, len(kvs))
  82. for i, kv := range kvs {
  83. evs[i] = storagepb.Event{
  84. Type: storagepb.DELETE,
  85. Kv: &storagepb.KeyValue{
  86. Key: kv.Key,
  87. }}
  88. }
  89. s.handle(rev, evs)
  90. return n, rev
  91. }
  92. func (s *watchableStore) TxnBegin() int64 {
  93. s.mu.Lock()
  94. s.tx = newOngoingTx()
  95. return s.store.TxnBegin()
  96. }
  97. func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
  98. rev, err = s.store.TxnPut(txnID, key, value)
  99. if err == nil {
  100. s.tx.put(string(key))
  101. }
  102. return rev, err
  103. }
  104. func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
  105. kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
  106. if err != nil {
  107. log.Panicf("unexpected range error (%v)", err)
  108. }
  109. n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
  110. if err == nil {
  111. for _, kv := range kvs {
  112. s.tx.del(string(kv.Key))
  113. }
  114. }
  115. return n, rev, err
  116. }
  117. func (s *watchableStore) TxnEnd(txnID int64) error {
  118. err := s.store.TxnEnd(txnID)
  119. if err != nil {
  120. return err
  121. }
  122. _, rev, _ := s.store.Range(nil, nil, 0, 0)
  123. evs := []storagepb.Event{}
  124. for k := range s.tx.putm {
  125. kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
  126. if err != nil {
  127. log.Panicf("unexpected range error (%v)", err)
  128. }
  129. ev := storagepb.Event{
  130. Type: storagepb.PUT,
  131. Kv: &kvs[0],
  132. }
  133. evs = append(evs, ev)
  134. }
  135. for k := range s.tx.delm {
  136. ev := storagepb.Event{
  137. Type: storagepb.DELETE,
  138. Kv: &storagepb.KeyValue{
  139. Key: []byte(k),
  140. },
  141. }
  142. evs = append(evs, ev)
  143. }
  144. s.handle(rev, evs)
  145. s.mu.Unlock()
  146. return nil
  147. }
  148. func (s *watchableStore) Close() error {
  149. close(s.stopc)
  150. s.wg.Wait()
  151. return s.store.Close()
  152. }
  153. func (s *watchableStore) NewWatchStream() WatchStream {
  154. watchStreamGauge.Inc()
  155. return &watchStream{
  156. watchable: s,
  157. ch: make(chan WatchResponse, chanBufLen),
  158. cancels: make(map[int64]CancelFunc),
  159. }
  160. }
  161. func (s *watchableStore) watch(key []byte, prefix bool, startRev, id int64, ch chan<- WatchResponse) (*watcher, CancelFunc) {
  162. s.mu.Lock()
  163. defer s.mu.Unlock()
  164. wa := &watcher{
  165. key: key,
  166. prefix: prefix,
  167. cur: startRev,
  168. id: id,
  169. ch: ch,
  170. }
  171. k := string(key)
  172. if startRev == 0 {
  173. if err := unsafeAddWatcher(&s.synced, k, wa); err != nil {
  174. log.Panicf("error unsafeAddWatcher (%v) for key %s", err, k)
  175. }
  176. } else {
  177. slowWatcherGauge.Inc()
  178. s.unsynced[wa] = struct{}{}
  179. }
  180. watcherGauge.Inc()
  181. cancel := CancelFunc(func() {
  182. s.mu.Lock()
  183. defer s.mu.Unlock()
  184. // remove global references of the watcher
  185. if _, ok := s.unsynced[wa]; ok {
  186. delete(s.unsynced, wa)
  187. slowWatcherGauge.Dec()
  188. watcherGauge.Dec()
  189. return
  190. }
  191. if v, ok := s.synced[k]; ok {
  192. if _, ok := v[wa]; ok {
  193. delete(v, wa)
  194. // if there is nothing in s.synced[k],
  195. // remove the key from the synced
  196. if len(v) == 0 {
  197. delete(s.synced, k)
  198. }
  199. watcherGauge.Dec()
  200. }
  201. }
  202. // If we cannot find it, it should have finished watch.
  203. })
  204. return wa, cancel
  205. }
  206. // syncWatchersLoop syncs the watcher in the unsyncd map every 100ms.
  207. func (s *watchableStore) syncWatchersLoop() {
  208. defer s.wg.Done()
  209. for {
  210. s.mu.Lock()
  211. s.syncWatchers()
  212. s.mu.Unlock()
  213. select {
  214. case <-time.After(100 * time.Millisecond):
  215. case <-s.stopc:
  216. return
  217. }
  218. }
  219. }
  220. // syncWatchers periodically syncs unsynced watchers by: Iterate all unsynced
  221. // watchers to get the minimum revision within its range, skipping the
  222. // watcher if its current revision is behind the compact revision of the
  223. // store. And use this minimum revision to get all key-value pairs. Then send
  224. // those events to watchers.
  225. func (s *watchableStore) syncWatchers() {
  226. s.store.mu.Lock()
  227. defer s.store.mu.Unlock()
  228. if len(s.unsynced) == 0 {
  229. return
  230. }
  231. // in order to find key-value pairs from unsynced watchers, we need to
  232. // find min revision index, and these revisions can be used to
  233. // query the backend store of key-value pairs
  234. minRev := int64(math.MaxInt64)
  235. curRev := s.store.currentRev.main
  236. compactionRev := s.store.compactMainRev
  237. // TODO: change unsynced struct type same to this
  238. keyToUnsynced := make(map[string]map[*watcher]struct{})
  239. for w := range s.unsynced {
  240. k := string(w.key)
  241. if w.cur > curRev {
  242. panic("watcher current revision should not exceed current revision")
  243. }
  244. if w.cur < compactionRev {
  245. // TODO: return error compacted to that watcher instead of
  246. // just removing it sliently from unsynced.
  247. delete(s.unsynced, w)
  248. continue
  249. }
  250. if minRev >= w.cur {
  251. minRev = w.cur
  252. }
  253. if _, ok := keyToUnsynced[k]; !ok {
  254. keyToUnsynced[k] = make(map[*watcher]struct{})
  255. }
  256. keyToUnsynced[k][w] = struct{}{}
  257. }
  258. minBytes, maxBytes := newRevBytes(), newRevBytes()
  259. revToBytes(revision{main: minRev}, minBytes)
  260. revToBytes(revision{main: curRev + 1}, maxBytes)
  261. // UnsafeRange returns keys and values. And in boltdb, keys are revisions.
  262. // values are actual key-value pairs in backend.
  263. tx := s.store.b.BatchTx()
  264. tx.Lock()
  265. ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
  266. tx.Unlock()
  267. evs := []storagepb.Event{}
  268. // get the list of all events from all key-value pairs
  269. for i, v := range vs {
  270. var kv storagepb.KeyValue
  271. if err := kv.Unmarshal(v); err != nil {
  272. log.Panicf("storage: cannot unmarshal event: %v", err)
  273. }
  274. k := string(kv.Key)
  275. if _, ok := keyToUnsynced[k]; !ok {
  276. continue
  277. }
  278. var ev storagepb.Event
  279. switch {
  280. case isTombstone(ks[i]):
  281. ev.Type = storagepb.DELETE
  282. default:
  283. ev.Type = storagepb.PUT
  284. }
  285. ev.Kv = &kv
  286. evs = append(evs, ev)
  287. }
  288. for w, es := range newWatcherToEventMap(keyToUnsynced, evs) {
  289. wr := WatchResponse{WatchID: w.id, Events: es}
  290. select {
  291. case w.ch <- wr:
  292. pendingEventsGauge.Add(float64(len(es)))
  293. default:
  294. // TODO: handle the full unsynced watchers.
  295. // continue to process other watchers for now, the full ones
  296. // will be processed next time and hopefully it will not be full.
  297. continue
  298. }
  299. k := string(w.key)
  300. if err := unsafeAddWatcher(&s.synced, k, w); err != nil {
  301. log.Panicf("error unsafeAddWatcher (%v) for key %s", err, k)
  302. }
  303. delete(s.unsynced, w)
  304. }
  305. slowWatcherGauge.Set(float64(len(s.unsynced)))
  306. }
  307. // handle handles the change of the happening event on all watchers.
  308. func (s *watchableStore) handle(rev int64, evs []storagepb.Event) {
  309. s.notify(rev, evs)
  310. }
  311. // notify notifies the fact that given event at the given rev just happened to
  312. // watchers that watch on the key of the event.
  313. func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
  314. we := newWatcherToEventMap(s.synced, evs)
  315. for _, wm := range s.synced {
  316. for w := range wm {
  317. if _, ok := we[w]; !ok {
  318. continue
  319. }
  320. es := we[w]
  321. wr := WatchResponse{WatchID: w.id, Events: es}
  322. select {
  323. case w.ch <- wr:
  324. pendingEventsGauge.Add(float64(len(es)))
  325. default:
  326. // move slow watcher to unsynced
  327. w.cur = rev
  328. s.unsynced[w] = struct{}{}
  329. delete(wm, w)
  330. slowWatcherGauge.Inc()
  331. }
  332. }
  333. }
  334. }
  335. type ongoingTx struct {
  336. // keys put/deleted in the ongoing txn
  337. putm map[string]struct{}
  338. delm map[string]struct{}
  339. }
  340. func newOngoingTx() *ongoingTx {
  341. return &ongoingTx{
  342. putm: make(map[string]struct{}),
  343. delm: make(map[string]struct{}),
  344. }
  345. }
  346. func (tx *ongoingTx) put(k string) {
  347. tx.putm[k] = struct{}{}
  348. if _, ok := tx.delm[k]; ok {
  349. delete(tx.delm, k)
  350. }
  351. }
  352. func (tx *ongoingTx) del(k string) {
  353. tx.delm[k] = struct{}{}
  354. if _, ok := tx.putm[k]; ok {
  355. delete(tx.putm, k)
  356. }
  357. }
  358. type watcher struct {
  359. // the watcher key
  360. key []byte
  361. // prefix indicates if watcher is on a key or a prefix.
  362. // If prefix is true, the watcher is on a prefix.
  363. prefix bool
  364. // cur is the current watcher revision.
  365. // If cur is behind the current revision of the KV,
  366. // watcher is unsynced and needs to catch up.
  367. cur int64
  368. id int64
  369. // a chan to send out the watch response.
  370. // The chan might be shared with other watchers.
  371. ch chan<- WatchResponse
  372. }
  373. // unsafeAddWatcher puts watcher with key k into watchableStore's synced.
  374. // Make sure to this is thread-safe using mutex before and after.
  375. func unsafeAddWatcher(synced *map[string]map[*watcher]struct{}, k string, wa *watcher) error {
  376. if wa == nil {
  377. return fmt.Errorf("nil watcher received")
  378. }
  379. mp := *synced
  380. if v, ok := mp[k]; ok {
  381. if _, ok := v[wa]; ok {
  382. return fmt.Errorf("put the same watcher twice: %+v", wa)
  383. } else {
  384. v[wa] = struct{}{}
  385. }
  386. return nil
  387. }
  388. mp[k] = make(map[*watcher]struct{})
  389. mp[k][wa] = struct{}{}
  390. return nil
  391. }
  392. // newWatcherToEventMap creates a map that has watcher as key and events as
  393. // value. It enables quick events look up by watcher.
  394. func newWatcherToEventMap(sm map[string]map[*watcher]struct{}, evs []storagepb.Event) map[*watcher][]storagepb.Event {
  395. watcherToEvents := make(map[*watcher][]storagepb.Event)
  396. for _, ev := range evs {
  397. key := string(ev.Kv.Key)
  398. // check all prefixes of the key to notify all corresponded watchers
  399. for i := 0; i <= len(key); i++ {
  400. k := string(key[:i])
  401. wm, ok := sm[k]
  402. if !ok {
  403. continue
  404. }
  405. for w := range wm {
  406. // the watcher needs to be notified when either it watches prefix or
  407. // the key is exactly matched.
  408. if !w.prefix && i != len(ev.Kv.Key) {
  409. continue
  410. }
  411. if _, ok := watcherToEvents[w]; !ok {
  412. watcherToEvents[w] = []storagepb.Event{}
  413. }
  414. watcherToEvents[w] = append(watcherToEvents[w], ev)
  415. }
  416. }
  417. }
  418. return watcherToEvents
  419. }