watchable_store.go

// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"fmt"
	"log"
	"math"
	"sync"
	"time"

	"github.com/coreos/etcd/storage/storagepb"
)

const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)

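// watchable is implemented by stores that can register a watching on a key
// or key prefix and deliver the matching events on the given channel.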
type watchable interface {
	watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc)
}

type watchableStore struct {
	mu sync.Mutex

	*store

	// contains all unsynced watchings that need to sync with events that have already happened
	unsynced map[*watching]struct{}

	// contains all synced watchings that are tracking the events that will happen
	// The key of the map is the key that the watching is watching on.
	synced map[string]map[*watching]struct{}

	tx *ongoingTx

	stopc chan struct{}
	wg    sync.WaitGroup
}

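// newWatchableStore opens a store at path and starts the background loop
// that syncs unsynced watchings.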
func newWatchableStore(path string) *watchableStore {
	s := &watchableStore{
		store:    newStore(path),
		unsynced: make(map[*watching]struct{}),
		synced:   make(map[string]map[*watching]struct{}),
		stopc:    make(chan struct{}),
	}
	s.wg.Add(1)
	go s.syncWatchingsLoop()
	return s
}

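// Put writes the key-value pair into the store and notifies the watchings
// on the key with a PUT event carrying the stored KeyValue.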
func (s *watchableStore) Put(key, value []byte) (rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rev = s.store.Put(key, value)
	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, nil, 0, rev)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	s.handle(rev, storagepb.Event{
		Type: storagepb.PUT,
		Kv:   &kvs[0],
	})
	return rev
}

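// DeleteRange deletes the given range from the store and notifies the
// watchings with a DELETE event for every removed key.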
func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	n, rev = s.store.DeleteRange(key, end)
	for _, kv := range kvs {
		s.handle(rev, storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: kv.Key,
			},
		})
	}
	return n, rev
}

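// TxnBegin starts a transaction on the underlying store. It locks s.mu and
// keeps it locked until TxnEnd is called.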
func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	s.tx = newOngoingTx()
	return s.store.TxnBegin()
}

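// TxnPut writes the key-value pair within the transaction and records the
// key so that a PUT event is sent out at TxnEnd.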
func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
	rev, err = s.store.TxnPut(txnID, key, value)
	if err == nil {
		s.tx.put(string(key))
	}
	return rev, err
}

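// TxnDeleteRange deletes the given range within the transaction and records
// every removed key so that DELETE events are sent out at TxnEnd.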
func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
	kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}

	n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
	if err == nil {
		for _, kv := range kvs {
			s.tx.del(string(kv.Key))
		}
	}
	return n, rev, err
}

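// TxnEnd ends the transaction and notifies the watchings about the keys put
// or deleted during it, at the store's current revision.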
func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		return err
	}

	_, rev, _ := s.store.Range(nil, nil, 0, 0)
	for k := range s.tx.putm {
		kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
		if err != nil {
			log.Panicf("unexpected range error (%v)", err)
		}
		s.handle(rev, storagepb.Event{
			Type: storagepb.PUT,
			Kv:   &kvs[0],
		})
	}
	for k := range s.tx.delm {
		s.handle(rev, storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: []byte(k),
			},
		})
	}
	s.mu.Unlock()
	return nil
}

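// Close stops the sync loop and closes the underlying store.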
func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}

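// NewWatcher returns a Watcher whose watchings share one buffered event
// channel of length chanBufLen.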
func (s *watchableStore) NewWatcher() Watcher {
	watcherGauge.Inc()
	return &watcher{
		watchable: s,
		ch:        make(chan storagepb.Event, chanBufLen),
	}
}

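// watch registers a watching on key (or on the key prefix when prefix is
// true). A watching starting at revision 0 goes directly into synced; any
// other start revision puts it into unsynced so the sync loop can catch it
// up. The returned CancelFunc removes the watching from the store.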
func (s *watchableStore) watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watching{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	k := string(key)
	if startRev == 0 {
		if err := unsafeAddWatching(&s.synced, k, wa); err != nil {
			log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
		}
	} else {
		slowWatchingGauge.Inc()
		s.unsynced[wa] = struct{}{}
	}
	watchingGauge.Inc()

	cancel := CancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()

		// remove global references to the watching
		if _, ok := s.unsynced[wa]; ok {
			delete(s.unsynced, wa)
			slowWatchingGauge.Dec()
			watchingGauge.Dec()
			return
		}

		if v, ok := s.synced[k]; ok {
			if _, ok := v[wa]; ok {
				delete(v, wa)
				// if there is nothing left in s.synced[k],
				// remove the key from the synced map
				if len(v) == 0 {
					delete(s.synced, k)
				}
				watchingGauge.Dec()
			}
		}
		// If we cannot find the watching, it has already finished watching.
	})

	return wa, cancel
}

// syncWatchingsLoop syncs the watchings in the unsynced map every 100ms.
func (s *watchableStore) syncWatchingsLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchings()
		s.mu.Unlock()

		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}

// syncWatchings syncs the unsynced watchings. It iterates all unsynced
// watchings to find the minimum revision among them, skipping any watching
// whose current revision is behind the compacted revision of the store. It
// then ranges the backend from that minimum revision and sends the resulting
// events to the matching watchings, moving them into synced.
func (s *watchableStore) syncWatchings() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if len(s.unsynced) == 0 {
		return
	}

	// in order to find the key-value pairs for the unsynced watchings, we
	// need the minimum revision among them; that revision is used to range
	// the backend store for the key-value pairs to send out
	minRev := int64(math.MaxInt64)

	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	// TODO: change the type of unsynced to match keyToUnsynced
	keyToUnsynced := make(map[string]map[*watching]struct{})

	for w := range s.unsynced {
		k := string(w.key)

		if w.cur > curRev {
			panic("watching's current revision should not exceed the store's current revision")
		}
		if w.cur < compactionRev {
			// TODO: return a compaction error to that watching instead of
			// just removing it silently from unsynced.
			delete(s.unsynced, w)
			continue
		}

		if minRev >= w.cur {
			minRev = w.cur
		}

		if _, ok := keyToUnsynced[k]; !ok {
			keyToUnsynced[k] = make(map[*watching]struct{})
		}
		keyToUnsynced[k][w] = struct{}{}
	}

	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. In boltdb, the keys are revisions
	// and the values are the actual key-value pairs stored in the backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	tx.Unlock()

	for i, v := range vs {
		var kv storagepb.KeyValue
		if err := kv.Unmarshal(v); err != nil {
			log.Panicf("storage: cannot unmarshal event: %v", err)
		}

		k := string(kv.Key)
		wm, ok := keyToUnsynced[k]
		if !ok {
			continue
		}

		var ev storagepb.Event
		switch {
		case isTombstone(ks[i]):
			ev.Type = storagepb.DELETE
		default:
			ev.Type = storagepb.PUT
		}
		ev.Kv = &kv

		for w := range wm {
			ev.WatchID = w.id
			select {
			case w.ch <- ev:
				pendingEventsGauge.Inc()
			default:
				// TODO: handle watchings whose channels are full.
				// For now, continue with the other watchings; the full ones
				// will be processed on the next pass and will hopefully
				// have drained by then.
				continue
			}
			if err := unsafeAddWatching(&s.synced, k, w); err != nil {
				log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
			}
			delete(s.unsynced, w)
		}
	}

	slowWatchingGauge.Set(float64(len(s.unsynced)))
}

// handle dispatches the event that happened at the given rev to the
// watchings that watch on the key of the event.
func (s *watchableStore) handle(rev int64, ev storagepb.Event) {
	s.notify(rev, ev)
}

// notify sends the event that happened at the given rev to the watchings
// that watch on the key of the event.
func (s *watchableStore) notify(rev int64, ev storagepb.Event) {
	// check all prefixes of the key to notify all corresponding watchings
	for i := 0; i <= len(ev.Kv.Key); i++ {
		k := string(ev.Kv.Key[:i])
		if wm, ok := s.synced[k]; ok {
			for w := range wm {
				// the watching needs to be notified when either it watches a
				// prefix or the key matches exactly.
				if !w.prefix && i != len(ev.Kv.Key) {
					continue
				}
				ev.WatchID = w.id
				select {
				case w.ch <- ev:
					pendingEventsGauge.Inc()
				default:
					// the channel is full; mark the watching as unsynced so
					// the sync loop catches it up from rev later.
					w.cur = rev
					s.unsynced[w] = struct{}{}
					delete(wm, w)
					slowWatchingGauge.Inc()
				}
			}
		}
	}
}

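// ongoingTx tracks the keys put or deleted in the currently open txn so that
// the corresponding events can be sent out at TxnEnd.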
type ongoingTx struct {
	// keys put/deleted in the ongoing txn
	putm map[string]struct{}
	delm map[string]struct{}
}

func newOngoingTx() *ongoingTx {
	return &ongoingTx{
		putm: make(map[string]struct{}),
		delm: make(map[string]struct{}),
	}
}

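// put records k as written in the ongoing txn, overriding a previous delete of k.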
func (tx *ongoingTx) put(k string) {
	tx.putm[k] = struct{}{}
	if _, ok := tx.delm[k]; ok {
		delete(tx.delm, k)
	}
}

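// del records k as deleted in the ongoing txn, overriding a previous put of k.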
func (tx *ongoingTx) del(k string) {
	tx.delm[k] = struct{}{}
	if _, ok := tx.putm[k]; ok {
		delete(tx.putm, k)
	}
}

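// watching is a single registration for events on a key or key prefix.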
type watching struct {
	// the watched key
	key []byte
	// prefix indicates if the watching is on a key or a prefix.
	// If prefix is true, the watching is on a prefix.
	prefix bool
	// cur is the current watching revision.
	// If cur is behind the current revision of the KV,
	// the watching is unsynced and needs to catch up.
	cur int64
	id  int64

	// a chan to send out the watched events.
	// The chan might be shared with other watchings.
	ch chan<- storagepb.Event
}

// unsafeAddWatching puts the watching with key k into watchableStore's synced
// map. It is not thread-safe; the caller must hold the store's mutex.
func unsafeAddWatching(synced *map[string]map[*watching]struct{}, k string, wa *watching) error {
	if wa == nil {
		return fmt.Errorf("nil watching received")
	}
	mp := *synced
	if v, ok := mp[k]; ok {
		if _, ok := v[wa]; ok {
			return fmt.Errorf("put the same watch twice: %+v", wa)
		}
		v[wa] = struct{}{}
		return nil
	}

	mp[k] = make(map[*watching]struct{})
	mp[k][wa] = struct{}{}
	return nil
}