watchable_store.go

// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"fmt"
	"log"
	"math"
	"strings"
	"sync"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/storage/backend"
	"github.com/coreos/etcd/storage/storagepb"
)

const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)

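// watchable abstracts the store operations needed by watch streams:
// registering a watcher on a key or prefix and reading the current revision.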
type watchable interface {
	watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
	rev() int64
}

type watchableStore struct {
	mu sync.Mutex

	*store

	// contains all unsynced watchers that need to sync with events that have happened
	unsynced map[*watcher]struct{}

	// contains all synced watchers that are in sync with the progress of the store.
	// The key of the map is the key that the watcher watches on.
	synced map[string]map[*watcher]struct{}

	tx *ongoingTx

	stopc chan struct{}
	wg    sync.WaitGroup
}

// cancelFunc updates unsynced and synced maps when running
// cancel operations.
type cancelFunc func()

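// newWatchableStore creates a watchableStore on top of the given backend,
// registers itself as the range deleter of the lessor (if any) so that lease
// revocations generate watch events, and starts the background loop that
// syncs unsynced watchers.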
func newWatchableStore(b backend.Backend, le lease.Lessor) *watchableStore {
	s := &watchableStore{
		store:    NewStore(b, le),
		unsynced: make(map[*watcher]struct{}),
		synced:   make(map[string]map[*watcher]struct{}),
		stopc:    make(chan struct{}),
	}
	if s.le != nil {
		// use this store as the deleter so revokes trigger watch events
		s.le.SetRangeDeleter(s)
	}
	s.wg.Add(1)
	go s.syncWatchersLoop()
	return s
}

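// Put writes the key-value pair into the store and notifies watchers
// with the resulting PUT event.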
func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rev = s.store.Put(key, value, lease)
	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, nil, 0, rev)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	ev := storagepb.Event{
		Type: storagepb.PUT,
		Kv:   &kvs[0],
	}
	s.handle(rev, []storagepb.Event{ev})
	return rev
}

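// DeleteRange deletes the given range from the store and notifies watchers
// with a DELETE event for each removed key.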
func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	n, rev = s.store.DeleteRange(key, end)
	evs := make([]storagepb.Event, len(kvs))
	for i, kv := range kvs {
		evs[i] = storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: kv.Key,
			}}
	}
	s.handle(rev, evs)
	return n, rev
}

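// TxnBegin starts a transaction. It acquires s.mu, which is held until the
// matching TxnEnd, and begins tracking the keys the transaction touches.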
func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	s.tx = newOngoingTx()
	return s.store.TxnBegin()
}

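// TxnPut puts the key-value pair within transaction txnID and records the
// key so that a PUT event is emitted at TxnEnd.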
func (s *watchableStore) TxnPut(txnID int64, key, value []byte, lease lease.LeaseID) (rev int64, err error) {
	rev, err = s.store.TxnPut(txnID, key, value, lease)
	if err == nil {
		s.tx.put(string(key))
	}
	return rev, err
}

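// TxnDeleteRange deletes the given range within transaction txnID and records
// the removed keys so that DELETE events are emitted at TxnEnd.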
func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
	kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
	if err == nil {
		for _, kv := range kvs {
			s.tx.del(string(kv.Key))
		}
	}
	return n, rev, err
}

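// TxnEnd ends transaction txnID, notifies watchers of the events for all keys
// put or deleted during the transaction, and releases the lock taken in TxnBegin.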
func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		return err
	}

	_, rev, _ := s.store.Range(nil, nil, 0, 0)

	evs := []storagepb.Event{}

	for k := range s.tx.putm {
		kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
		if err != nil {
			log.Panicf("unexpected range error (%v)", err)
		}
		ev := storagepb.Event{
			Type: storagepb.PUT,
			Kv:   &kvs[0],
		}
		evs = append(evs, ev)
	}

	for k := range s.tx.delm {
		ev := storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: []byte(k),
			},
		}
		evs = append(evs, ev)
	}

	s.handle(rev, evs)
	s.mu.Unlock()
	return nil
}

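// Close stops the background watcher-sync loop and closes the underlying store.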
func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}

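// NewWatchStream returns a WatchStream whose watchers all share one buffered
// response channel.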
func (s *watchableStore) NewWatchStream() WatchStream {
	watchStreamGauge.Inc()
	return &watchStream{
		watchable: s,
		ch:        make(chan WatchResponse, chanBufLen),
		cancels:   make(map[WatchID]cancelFunc),
	}
}

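// watch registers a watcher on the given key or prefix. A watcher starting at
// revision 0 goes directly into the synced map; a watcher with a non-zero start
// revision is put into unsynced and caught up by the sync loop. The returned
// cancelFunc removes the watcher from whichever map it is in.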
func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watcher{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	k := string(key)
	if startRev == 0 {
		if err := unsafeAddWatcher(&s.synced, k, wa); err != nil {
			log.Panicf("error unsafeAddWatcher (%v) for key %s", err, k)
		}
	} else {
		slowWatcherGauge.Inc()
		s.unsynced[wa] = struct{}{}
	}
	watcherGauge.Inc()

	cancel := cancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		// remove global references of the watcher
		if _, ok := s.unsynced[wa]; ok {
			delete(s.unsynced, wa)
			slowWatcherGauge.Dec()
			watcherGauge.Dec()
			return
		}

		if v, ok := s.synced[k]; ok {
			if _, ok := v[wa]; ok {
				delete(v, wa)
				// if there is nothing in s.synced[k],
				// remove the key from the synced map
				if len(v) == 0 {
					delete(s.synced, k)
				}
				watcherGauge.Dec()
			}
		}
		// If the watcher is in neither map, its watch has already finished.
	})

	return wa, cancel
}

// syncWatchersLoop syncs the watchers in the unsynced map every 100ms.
func (s *watchableStore) syncWatchersLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchers()
		s.mu.Unlock()

		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}

// syncWatchers syncs the unsynced watchers. It iterates all unsynced watchers
// to find the minimum revision they need, skipping (and removing) any watcher
// whose current revision is behind the compact revision of the store. It then
// ranges the backend from that minimum revision, turns the resulting key-value
// pairs into events, and sends those events to the watchers.
func (s *watchableStore) syncWatchers() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if len(s.unsynced) == 0 {
		return
	}

	// to find the key-value pairs for the unsynced watchers, we need the
	// minimum revision they are interested in; that revision is used to
	// range the backend store for key-value pairs
	minRev := int64(math.MaxInt64)

	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	// TODO: change the type of unsynced to be the same as this
	keyToUnsynced := make(map[string]map[*watcher]struct{})
	prefixes := make(map[string]struct{})

	for w := range s.unsynced {
		k := string(w.key)

		if w.cur > curRev {
			panic("watcher current revision should not exceed current revision")
		}

		if w.cur < compactionRev {
			// TODO: return error compacted to that watcher instead of
			// just removing it silently from unsynced.
			delete(s.unsynced, w)
			continue
		}

		if minRev >= w.cur {
			minRev = w.cur
		}

		if _, ok := keyToUnsynced[k]; !ok {
			keyToUnsynced[k] = make(map[*watcher]struct{})
		}
		keyToUnsynced[k][w] = struct{}{}

		if w.prefix {
			prefixes[k] = struct{}{}
		}
	}

	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. In boltdb, the keys are revisions
	// and the values are the actual key-value pairs stored in the backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	tx.Unlock()

	evs := []storagepb.Event{}

	// get the list of all events from all key-value pairs
	for i, v := range vs {
		var kv storagepb.KeyValue
		if err := kv.Unmarshal(v); err != nil {
			log.Panicf("storage: cannot unmarshal event: %v", err)
		}

		k := string(kv.Key)
		if _, ok := keyToUnsynced[k]; !ok && !matchPrefix(k, prefixes) {
			continue
		}

		var ev storagepb.Event
		switch {
		case isTombstone(ks[i]):
			ev.Type = storagepb.DELETE
		default:
			ev.Type = storagepb.PUT
		}
		ev.Kv = &kv

		evs = append(evs, ev)
	}

	for w, es := range newWatcherToEventMap(keyToUnsynced, evs) {
		select {
		// s.store.Rev also takes the lock, so read currentRev.main directly
		case w.ch <- WatchResponse{WatchID: w.id, Events: es, Revision: s.store.currentRev.main}:
			pendingEventsGauge.Add(float64(len(es)))
		default:
			// TODO: handle the full unsynced watchers.
			// continue to process other watchers for now, the full ones
			// will be processed next time and hopefully it will not be full.
			continue
		}

		k := string(w.key)
		if err := unsafeAddWatcher(&s.synced, k, w); err != nil {
			log.Panicf("error unsafeAddWatcher (%v) for key %s", err, k)
		}
		delete(s.unsynced, w)
	}

	slowWatcherGauge.Set(float64(len(s.unsynced)))
}

// handle notifies the watchers of the events that happened at the given rev.
func (s *watchableStore) handle(rev int64, evs []storagepb.Event) {
	s.notify(rev, evs)
}

// notify sends the given events, which happened at the given rev, to the
// watchers that watch on the keys of the events.
func (s *watchableStore) notify(rev int64, evs []storagepb.Event) {
	we := newWatcherToEventMap(s.synced, evs)
	for _, wm := range s.synced {
		for w := range wm {
			es, ok := we[w]
			if !ok {
				continue
			}
			select {
			case w.ch <- WatchResponse{WatchID: w.id, Events: es, Revision: s.Rev()}:
				pendingEventsGauge.Add(float64(len(es)))
			default:
				// move slow watcher to unsynced
				w.cur = rev
				s.unsynced[w] = struct{}{}
				delete(wm, w)
				slowWatcherGauge.Inc()
			}
		}
	}
}

func (s *watchableStore) rev() int64 { return s.store.Rev() }

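// ongoingTx tracks the keys written and deleted by the transaction in
// progress so that TxnEnd can generate the corresponding watch events.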
type ongoingTx struct {
	// keys put/deleted in the ongoing txn
	putm map[string]struct{}
	delm map[string]struct{}
}

func newOngoingTx() *ongoingTx {
	return &ongoingTx{
		putm: make(map[string]struct{}),
		delm: make(map[string]struct{}),
	}
}

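// put records k as written in the ongoing txn; an earlier delete of the
// same key within the txn is superseded.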
func (tx *ongoingTx) put(k string) {
	tx.putm[k] = struct{}{}
	if _, ok := tx.delm[k]; ok {
		delete(tx.delm, k)
	}
}

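// del records k as deleted in the ongoing txn; an earlier put of the
// same key within the txn is superseded.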
func (tx *ongoingTx) del(k string) {
	tx.delm[k] = struct{}{}
	if _, ok := tx.putm[k]; ok {
		delete(tx.putm, k)
	}
}

type watcher struct {
	// the watcher key
	key []byte

	// prefix indicates if watcher is on a key or a prefix.
	// If prefix is true, the watcher is on a prefix.
	prefix bool

	// cur is the current watcher revision.
	// If cur is behind the current revision of the KV,
	// watcher is unsynced and needs to catch up.
	cur int64
	id  WatchID

	// a chan to send out the watch response.
	// The chan might be shared with other watchers.
	ch chan<- WatchResponse
}

// unsafeAddWatcher puts the watcher with key k into watchableStore's synced map.
// Callers must make this thread-safe by holding the store's mutex before and after.
func unsafeAddWatcher(synced *map[string]map[*watcher]struct{}, k string, wa *watcher) error {
	if wa == nil {
		return fmt.Errorf("nil watcher received")
	}
	mp := *synced
	if v, ok := mp[k]; ok {
		if _, ok := v[wa]; ok {
			return fmt.Errorf("put the same watcher twice: %+v", wa)
		} else {
			v[wa] = struct{}{}
		}
		return nil
	}

	mp[k] = make(map[*watcher]struct{})
	mp[k][wa] = struct{}{}
	return nil
}

// newWatcherToEventMap creates a map that has watcher as key and events as
// value. It enables quick event lookup by watcher.
func newWatcherToEventMap(sm map[string]map[*watcher]struct{}, evs []storagepb.Event) map[*watcher][]storagepb.Event {
	watcherToEvents := make(map[*watcher][]storagepb.Event)
	for _, ev := range evs {
		key := string(ev.Kv.Key)

		// check all prefixes of the key to notify all corresponding watchers
		for i := 0; i <= len(key); i++ {
			k := string(key[:i])

			wm, ok := sm[k]
			if !ok {
				continue
			}

			for w := range wm {
				// the watcher needs to be notified when either it watches a prefix or
				// the key matches exactly.
				if !w.prefix && i != len(ev.Kv.Key) {
					continue
				}

				if _, ok := watcherToEvents[w]; !ok {
					watcherToEvents[w] = []storagepb.Event{}
				}
				watcherToEvents[w] = append(watcherToEvents[w], ev)
			}
		}
	}
	return watcherToEvents
}

// matchPrefix returns true if key has any matching prefix
// from prefixes map.
func matchPrefix(key string, prefixes map[string]struct{}) bool {
	for p := range prefixes {
		if strings.HasPrefix(key, p) {
			return true
		}
	}
	return false
}