watchable_store.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package storage
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/storage/storagepb"
  20. )
  21. const (
  22. // chanBufLen is the length of the buffered chan
  23. // for sending out watched events.
  24. // TODO: find a good buf value. 1024 is just a random one that
  25. // seems to be reasonable.
  26. chanBufLen = 1024
  27. )
// watchable is what a watcher needs from its backing store: the ability to
// register a watching. watchableStore satisfies it and hands itself to every
// watcher it creates in NewWatcher.
type watchable interface {
	// watch registers a watching on key, or on every key that has key as a
	// prefix when prefix is true, beginning at startRev and delivering events
	// on ch. It returns the new watching and a function that unregisters it.
	watch(key []byte, prefix bool, startRev int64, ch chan<- storagepb.Event) (*watching, CancelFunc)
}
// watchableStore layers watch support on top of the embedded store: every
// mutation made through it is turned into events that are pushed to the
// registered watchings.
type watchableStore struct {
	// mu is held whenever synced, unsynced or tx are touched. TxnBegin
	// acquires it and a successful TxnEnd releases it, so it stays locked for
	// the whole duration of a transaction.
	mu sync.Mutex

	*store

	// contains all unsynced watching that needs to sync events that have happened
	unsynced map[*watching]struct{}

	// contains all synced watching that are tracking the events that will happen
	// The key of the map is the key that the watching is watching on.
	synced map[string][]*watching

	// tx records which keys the in-flight transaction has put or deleted.
	// TxnBegin replaces it with a fresh value.
	tx *ongoingTx

	// stopc is closed by Close to make syncWatchingsLoop return.
	stopc chan struct{}

	// wg tracks the syncWatchingsLoop goroutine so Close can wait for it.
	wg sync.WaitGroup
}
  43. func newWatchableStore(path string) *watchableStore {
  44. s := &watchableStore{
  45. store: newStore(path),
  46. unsynced: make(map[*watching]struct{}),
  47. synced: make(map[string][]*watching),
  48. stopc: make(chan struct{}),
  49. }
  50. s.wg.Add(1)
  51. go s.syncWatchingsLoop()
  52. return s
  53. }
  54. func (s *watchableStore) Put(key, value []byte) (rev int64) {
  55. s.mu.Lock()
  56. defer s.mu.Unlock()
  57. rev = s.store.Put(key, value)
  58. // TODO: avoid this range
  59. kvs, _, err := s.store.Range(key, nil, 0, rev)
  60. if err != nil {
  61. log.Panicf("unexpected range error (%v)", err)
  62. }
  63. s.handle(rev, storagepb.Event{
  64. Type: storagepb.PUT,
  65. Kv: &kvs[0],
  66. })
  67. return rev
  68. }
  69. func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
  70. s.mu.Lock()
  71. defer s.mu.Unlock()
  72. // TODO: avoid this range
  73. kvs, _, err := s.store.Range(key, end, 0, 0)
  74. if err != nil {
  75. log.Panicf("unexpected range error (%v)", err)
  76. }
  77. n, rev = s.store.DeleteRange(key, end)
  78. for _, kv := range kvs {
  79. s.handle(rev, storagepb.Event{
  80. Type: storagepb.DELETE,
  81. Kv: &storagepb.KeyValue{
  82. Key: kv.Key,
  83. },
  84. })
  85. }
  86. return n, rev
  87. }
  88. func (s *watchableStore) TxnBegin() int64 {
  89. s.mu.Lock()
  90. s.tx = newOngoingTx()
  91. return s.store.TxnBegin()
  92. }
  93. func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
  94. rev, err = s.store.TxnPut(txnID, key, value)
  95. if err == nil {
  96. s.tx.put(string(key))
  97. }
  98. return rev, err
  99. }
  100. func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
  101. kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
  102. if err != nil {
  103. log.Panicf("unexpected range error (%v)", err)
  104. }
  105. n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
  106. if err == nil {
  107. for _, kv := range kvs {
  108. s.tx.del(string(kv.Key))
  109. }
  110. }
  111. return n, rev, err
  112. }
  113. func (s *watchableStore) TxnEnd(txnID int64) error {
  114. err := s.store.TxnEnd(txnID)
  115. if err != nil {
  116. return err
  117. }
  118. _, rev, _ := s.store.Range(nil, nil, 0, 0)
  119. for k := range s.tx.putm {
  120. kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
  121. if err != nil {
  122. log.Panicf("unexpected range error (%v)", err)
  123. }
  124. s.handle(rev, storagepb.Event{
  125. Type: storagepb.PUT,
  126. Kv: &kvs[0],
  127. })
  128. }
  129. for k := range s.tx.delm {
  130. s.handle(rev, storagepb.Event{
  131. Type: storagepb.DELETE,
  132. Kv: &storagepb.KeyValue{
  133. Key: []byte(k),
  134. },
  135. })
  136. }
  137. s.mu.Unlock()
  138. return nil
  139. }
  140. func (s *watchableStore) Close() error {
  141. close(s.stopc)
  142. s.wg.Wait()
  143. return s.store.Close()
  144. }
  145. func (s *watchableStore) NewWatcher() Watcher {
  146. return &watcher{
  147. watchable: s,
  148. ch: make(chan storagepb.Event, chanBufLen),
  149. }
  150. }
  151. func (s *watchableStore) watch(key []byte, prefix bool, startRev int64, ch chan<- storagepb.Event) (*watching, CancelFunc) {
  152. s.mu.Lock()
  153. defer s.mu.Unlock()
  154. wa := &watching{
  155. key: key,
  156. prefix: prefix,
  157. cur: startRev,
  158. ch: ch,
  159. }
  160. k := string(key)
  161. if startRev == 0 {
  162. s.synced[k] = append(s.synced[k], wa)
  163. } else {
  164. slowWatchingGauge.Inc()
  165. s.unsynced[wa] = struct{}{}
  166. }
  167. watchingGauge.Inc()
  168. cancel := CancelFunc(func() {
  169. s.mu.Lock()
  170. defer s.mu.Unlock()
  171. // remove global references of the watching
  172. if _, ok := s.unsynced[wa]; ok {
  173. delete(s.unsynced, wa)
  174. slowWatchingGauge.Dec()
  175. watchingGauge.Dec()
  176. return
  177. }
  178. for i, w := range s.synced[k] {
  179. if w == wa {
  180. s.synced[k] = append(s.synced[k][:i], s.synced[k][i+1:]...)
  181. watchingGauge.Dec()
  182. }
  183. }
  184. // If we cannot find it, it should have finished watch.
  185. })
  186. return wa, cancel
  187. }
  188. // syncWatchingsLoop syncs the watching in the unsyncd map every 100ms.
  189. func (s *watchableStore) syncWatchingsLoop() {
  190. defer s.wg.Done()
  191. for {
  192. s.mu.Lock()
  193. s.syncWatchings()
  194. s.mu.Unlock()
  195. select {
  196. case <-time.After(100 * time.Millisecond):
  197. case <-s.stopc:
  198. return
  199. }
  200. }
  201. }
  202. // syncWatchings syncs the watchings in the unsyncd map.
  203. func (s *watchableStore) syncWatchings() {
  204. _, curRev, _ := s.store.Range(nil, nil, 0, 0)
  205. for w := range s.unsynced {
  206. var end []byte
  207. if w.prefix {
  208. end = make([]byte, len(w.key))
  209. copy(end, w.key)
  210. end[len(w.key)-1]++
  211. }
  212. limit := cap(w.ch) - len(w.ch)
  213. // the channel is full, try it in the next round
  214. if limit == 0 {
  215. continue
  216. }
  217. evs, nextRev, err := s.store.RangeEvents(w.key, end, int64(limit), w.cur)
  218. if err != nil {
  219. // TODO: send error event to watching
  220. delete(s.unsynced, w)
  221. continue
  222. }
  223. // push events to the channel
  224. for _, ev := range evs {
  225. w.ch <- ev
  226. pendingEventsGauge.Inc()
  227. }
  228. // switch to tracking future events if needed
  229. if nextRev > curRev {
  230. s.synced[string(w.key)] = append(s.synced[string(w.key)], w)
  231. delete(s.unsynced, w)
  232. continue
  233. }
  234. // put it back to try it in the next round
  235. w.cur = nextRev
  236. }
  237. slowWatchingGauge.Set(float64(len(s.unsynced)))
  238. }
// handle handles the change of the happening event on all watchings.
// At present it only forwards rev and ev to notify. Put, DeleteRange and
// TxnEnd call it while s.mu is locked.
func (s *watchableStore) handle(rev int64, ev storagepb.Event) {
	s.notify(rev, ev)
}
  243. // notify notifies the fact that given event at the given rev just happened to
  244. // watchings that watch on the key of the event.
  245. func (s *watchableStore) notify(rev int64, ev storagepb.Event) {
  246. // check all prefixes of the key to notify all corresponded watchings
  247. for i := 0; i <= len(ev.Kv.Key); i++ {
  248. ws := s.synced[string(ev.Kv.Key[:i])]
  249. nws := ws[:0]
  250. for _, w := range ws {
  251. // the watching needs to be notified when either it watches prefix or
  252. // the key is exactly matched.
  253. if !w.prefix && i != len(ev.Kv.Key) {
  254. continue
  255. }
  256. select {
  257. case w.ch <- ev:
  258. pendingEventsGauge.Inc()
  259. nws = append(nws, w)
  260. default:
  261. w.cur = rev
  262. s.unsynced[w] = struct{}{}
  263. slowWatchingGauge.Inc()
  264. }
  265. }
  266. s.synced[string(ev.Kv.Key[:i])] = nws
  267. }
  268. }
  269. type ongoingTx struct {
  270. // keys put/deleted in the ongoing txn
  271. putm map[string]struct{}
  272. delm map[string]struct{}
  273. }
  274. func newOngoingTx() *ongoingTx {
  275. return &ongoingTx{
  276. putm: make(map[string]struct{}),
  277. delm: make(map[string]struct{}),
  278. }
  279. }
  280. func (tx *ongoingTx) put(k string) {
  281. tx.putm[k] = struct{}{}
  282. if _, ok := tx.delm[k]; ok {
  283. delete(tx.delm, k)
  284. }
  285. }
  286. func (tx *ongoingTx) del(k string) {
  287. tx.delm[k] = struct{}{}
  288. if _, ok := tx.putm[k]; ok {
  289. delete(tx.putm, k)
  290. }
  291. }
// watching is a single registration of interest in a key or key prefix. It
// lives either in watchableStore.synced (caught up) or in
// watchableStore.unsynced (still catching up).
type watching struct {
	// the watching key
	key []byte
	// prefix indicates if watching is on a key or a prefix.
	// If prefix is true, the watching is on a prefix.
	prefix bool
	// cur is the current watching revision.
	// If cur is behind the current revision of the KV,
	// watching is unsynced and needs to catch up.
	cur int64

	// a chan to send out the watched events.
	// The chan might be shared with other watchings.
	ch chan<- storagepb.Event
}