watchable_store.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/storage/storagepb"
  20. )
const (
	// chanBufLen is the capacity of the buffered event channel that
	// NewWatcher allocates for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)
// watchable is the interface through which a watching is registered.
// In this file it is implemented by *watchableStore.
type watchable interface {
	// watch registers a watching on key (treated as a prefix when prefix is
	// true) that sends its events to ch, and returns the watching together
	// with a CancelFunc for it. In the watchableStore implementation a
	// startRev of 0 tracks only future events; any other startRev makes the
	// watching catch up from that revision first.
	watch(key []byte, prefix bool, startRev int64, ch chan<- storagepb.Event) (*watching, CancelFunc)
}
// watchableStore wraps a store and delivers an event to registered watchings
// for every key that is put or deleted through it.
type watchableStore struct {
	// mu guards the fields below. Put, DeleteRange, watch, the cancel funcs
	// and the sync loop hold it for the duration of the call; TxnBegin
	// acquires it and TxnEnd releases it.
	mu sync.Mutex

	*store

	// contains all unsynced watching that needs to sync events that have happened
	unsynced map[*watching]struct{}

	// contains all synced watching that are tracking the events that will happen
	// The key of the map is the key that the watching is watching on.
	synced map[string][]*watching

	// tx records the keys put/deleted by the ongoing txn; it is replaced by
	// every TxnBegin and read by TxnEnd.
	tx *ongoingTx

	// stopc is closed by Close to stop syncWatchingsLoop.
	stopc chan struct{}
	// wg lets Close wait for syncWatchingsLoop to return.
	wg sync.WaitGroup
}
  43. func newWatchableStore(path string) *watchableStore {
  44. s := &watchableStore{
  45. store: newStore(path),
  46. unsynced: make(map[*watching]struct{}),
  47. synced: make(map[string][]*watching),
  48. stopc: make(chan struct{}),
  49. }
  50. s.wg.Add(1)
  51. go s.syncWatchingsLoop()
  52. return s
  53. }
  54. func (s *watchableStore) Put(key, value []byte) (rev int64) {
  55. s.mu.Lock()
  56. defer s.mu.Unlock()
  57. rev = s.store.Put(key, value)
  58. // TODO: avoid this range
  59. kvs, _, err := s.store.Range(key, nil, 0, rev)
  60. if err != nil {
  61. log.Panicf("unexpected range error (%v)", err)
  62. }
  63. s.handle(rev, storagepb.Event{
  64. Type: storagepb.PUT,
  65. Kv: &kvs[0],
  66. })
  67. return rev
  68. }
  69. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  70. s.mu.Lock()
  71. defer s.mu.Unlock()
  72. // TODO: avoid this range
  73. kvs, _, err := s.store.Range(key, end, 0, 0)
  74. if err != nil {
  75. log.Panicf("unexpected range error (%v)", err)
  76. }
  77. n, rev = s.store.DeleteRange(key, end)
  78. for _, kv := range kvs {
  79. s.handle(rev, storagepb.Event{
  80. Type: storagepb.DELETE,
  81. Kv: &storagepb.KeyValue{
  82. Key: kv.Key,
  83. },
  84. })
  85. }
  86. return n, rev
  87. }
  88. func (s *watchableStore) TxnBegin() int64 {
  89. s.mu.Lock()
  90. s.tx = newOngoingTx()
  91. return s.store.TxnBegin()
  92. }
  93. func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
  94. rev, err = s.store.TxnPut(txnID, key, value)
  95. if err == nil {
  96. s.tx.put(string(key))
  97. }
  98. return rev, err
  99. }
  100. func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
  101. kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
  102. if err != nil {
  103. log.Panicf("unexpected range error (%v)", err)
  104. }
  105. n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
  106. if err == nil {
  107. for _, kv := range kvs {
  108. s.tx.del(string(kv.Key))
  109. }
  110. }
  111. return n, rev, err
  112. }
  113. func (s *watchableStore) TxnEnd(txnID int64) error {
  114. err := s.store.TxnEnd(txnID)
  115. if err != nil {
  116. return err
  117. }
  118. _, rev, _ := s.store.Range(nil, nil, 0, 0)
  119. for k := range s.tx.putm {
  120. kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
  121. if err != nil {
  122. log.Panicf("unexpected range error (%v)", err)
  123. }
  124. s.handle(rev, storagepb.Event{
  125. Type: storagepb.PUT,
  126. Kv: &kvs[0],
  127. })
  128. }
  129. for k := range s.tx.delm {
  130. s.handle(rev, storagepb.Event{
  131. Type: storagepb.DELETE,
  132. Kv: &storagepb.KeyValue{
  133. Key: []byte(k),
  134. },
  135. })
  136. }
  137. s.mu.Unlock()
  138. return nil
  139. }
  140. func (s *watchableStore) Close() error {
  141. close(s.stopc)
  142. s.wg.Wait()
  143. return s.store.Close()
  144. }
  145. func (s *watchableStore) NewWatcher() Watcher {
  146. watcherGauge.Inc()
  147. return &watcher{
  148. watchable: s,
  149. ch: make(chan storagepb.Event, chanBufLen),
  150. }
  151. }
  152. func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, ch chan<- storagepb.Event) (*watching, CancelFunc) {
  153. s.mu.Lock()
  154. defer s.mu.Unlock()
  155. wa := &watching{
  156. key: key,
  157. prefix: prefix,
  158. cur: startRev,
  159. ch: ch,
  160. }
  161. k := string(key)
  162. if startRev == 0 {
  163. s.synced[k] = append(s.synced[k], wa)
  164. } else {
  165. slowWatchingGauge.Inc()
  166. s.unsynced[wa] = struct{}{}
  167. }
  168. watchingGauge.Inc()
  169. cancel := CancelFunc(func() {
  170. s.mu.Lock()
  171. defer s.mu.Unlock()
  172. // remove global references of the watching
  173. if _, ok := s.unsynced[wa]; ok {
  174. delete(s.unsynced, wa)
  175. slowWatchingGauge.Dec()
  176. watchingGauge.Dec()
  177. return
  178. }
  179. for i, w := range s.synced[k] {
  180. if w == wa {
  181. s.synced[k] = append(s.synced[k][:i], s.synced[k][i+1:]...)
  182. watchingGauge.Dec()
  183. }
  184. }
  185. // If we cannot find it, it should have finished watch.
  186. })
  187. return wa, cancel
  188. }
  189. // syncWatchingsLoop syncs the watching in the unsyncd map every 100ms.
  190. func (s *watchableStore) syncWatchingsLoop() {
  191. defer s.wg.Done()
  192. for {
  193. s.mu.Lock()
  194. s.syncWatchings()
  195. s.mu.Unlock()
  196. select {
  197. case <-time.After(100 * time.Millisecond):
  198. case <-s.stopc:
  199. return
  200. }
  201. }
  202. }
  203. // syncWatchings syncs the watchings in the unsyncd map.
  204. func (s *watchableStore) syncWatchings() {
  205. _, curRev, _ := s.store.Range(nil, nil, 0, 0)
  206. for w := range s.unsynced {
  207. var end []byte
  208. if w.prefix {
  209. end = make([]byte, len(w.key))
  210. copy(end, w.key)
  211. end[len(w.key)-1]++
  212. }
  213. limit := cap(w.ch) - len(w.ch)
  214. // the channel is full, try it in the next round
  215. if limit == 0 {
  216. continue
  217. }
  218. evs, nextRev, err := s.store.RangeEvents(w.key, end, int64(limit), w.cur)
  219. if err != nil {
  220. // TODO: send error event to watching
  221. delete(s.unsynced, w)
  222. continue
  223. }
  224. // push events to the channel
  225. for _, ev := range evs {
  226. w.ch <- ev
  227. pendingEventsGauge.Inc()
  228. }
  229. // switch to tracking future events if needed
  230. if nextRev > curRev {
  231. s.synced[string(w.key)] = append(s.synced[string(w.key)], w)
  232. delete(s.unsynced, w)
  233. continue
  234. }
  235. // put it back to try it in the next round
  236. w.cur = nextRev
  237. }
  238. slowWatchingGauge.Set(float64(len(s.unsynced)))
  239. }
// handle handles the change of the happening event on all watchings.
// It currently only forwards the event to notify. The callers in this file
// (Put, DeleteRange, TxnEnd) invoke it while s.mu is held.
func (s *watchableStore) handle(rev int64, ev storagepb.Event) {
	s.notify(rev, ev)
}
  244. // notify notifies the fact that given event at the given rev just happened to
  245. // watchings that watch on the key of the event.
  246. func (s *watchableStore) notify(rev int64, ev storagepb.Event) {
  247. // check all prefixes of the key to notify all corresponded watchings
  248. for i := 0; i <= len(ev.Kv.Key); i++ {
  249. ws := s.synced[string(ev.Kv.Key[:i])]
  250. nws := ws[:0]
  251. for _, w := range ws {
  252. // the watching needs to be notified when either it watches prefix or
  253. // the key is exactly matched.
  254. if !w.prefix && i != len(ev.Kv.Key) {
  255. continue
  256. }
  257. select {
  258. case w.ch <- ev:
  259. pendingEventsGauge.Inc()
  260. nws = append(nws, w)
  261. default:
  262. w.cur = rev
  263. s.unsynced[w] = struct{}{}
  264. slowWatchingGauge.Inc()
  265. }
  266. }
  267. s.synced[string(ev.Kv.Key[:i])] = nws
  268. }
  269. }
  270. type ongoingTx struct {
  271. // keys put/deleted in the ongoing txn
  272. putm map[string]struct{}
  273. delm map[string]struct{}
  274. }
  275. func newOngoingTx() *ongoingTx {
  276. return &ongoingTx{
  277. putm: make(map[string]struct{}),
  278. delm: make(map[string]struct{}),
  279. }
  280. }
  281. func (tx *ongoingTx) put(k string) {
  282. tx.putm[k] = struct{}{}
  283. if _, ok := tx.delm[k]; ok {
  284. delete(tx.delm, k)
  285. }
  286. }
  287. func (tx *ongoingTx) del(k string) {
  288. tx.delm[k] = struct{}{}
  289. if _, ok := tx.putm[k]; ok {
  290. delete(tx.putm, k)
  291. }
  292. }
// watching is a single registration created by watchableStore.watch: a key or
// prefix to watch, the revision to resume from, and the channel events are
// sent to.
type watching struct {
	// the watching key
	key []byte
	// prefix indicates if watching is on a key or a prefix.
	// If prefix is true, the watching is on a prefix.
	prefix bool
	// cur is the current watching revision.
	// If cur is behind the current revision of the KV,
	// watching is unsynced and needs to catch up.
	// It starts as the startRev passed to watch, is advanced by
	// syncWatchings, and is reset by notify when the channel is full.
	cur int64

	// a chan to send out the watched events.
	// The chan might be shared with other watchings.
	ch chan<- storagepb.Event
}