- // Copyright 2015 CoreOS, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package storage
- import (
- "fmt"
- "log"
- "math"
- "sync"
- "time"
- "github.com/coreos/etcd/storage/storagepb"
- )
- const (
- // chanBufLen is the length of the buffered chan
- // used for sending out watched events.
- // TODO: find a good buffer size. 1024 is an arbitrary value that
- // seems to be reasonable.
- chanBufLen = 1024
- )
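- // watchable is implemented by stores that support watching.
- // watch registers ch to receive events for key (or for the key prefix when
- // prefix is true) starting at startRev; a startRev of 0 means watching from
- // the current revision onward. It returns the registered watching and a
- // CancelFunc that removes it.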
- type watchable interface {
- watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc)
- }
- type watchableStore struct {
- mu sync.Mutex
- *store
- // contains all unsynced watchings that need to sync with events that have already happened
- unsynced map[*watching]struct{}
- // contains all synced watchings that are tracking the events that will happen.
- // The key of the map is the key that the watching is watching on.
- synced map[string]map[*watching]struct{}
- tx *ongoingTx
- stopc chan struct{}
- wg sync.WaitGroup
- }
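- // newWatchableStore creates a watchableStore backed by the store at path and
- // starts the background goroutine that periodically syncs unsynced watchings.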
- func newWatchableStore(path string) *watchableStore {
- s := &watchableStore{
- store: newStore(path),
- unsynced: make(map[*watching]struct{}),
- synced: make(map[string]map[*watching]struct{}),
- stopc: make(chan struct{}),
- }
- s.wg.Add(1)
- go s.syncWatchingsLoop()
- return s
- }
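- // Put writes the key-value pair to the underlying store and notifies the
- // synced watchings registered on the key or on one of its prefixes.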
- func (s *watchableStore) Put(key, value []byte) (rev int64) {
- s.mu.Lock()
- defer s.mu.Unlock()
- rev = s.store.Put(key, value)
- // TODO: avoid this range
- kvs, _, err := s.store.Range(key, nil, 0, rev)
- if err != nil {
- log.Panicf("unexpected range error (%v)", err)
- }
- s.handle(rev, storagepb.Event{
- Type: storagepb.PUT,
- Kv: &kvs[0],
- })
- return rev
- }
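- // DeleteRange deletes the keys in the given range from the underlying store
- // and sends a DELETE event for each removed key to the matching watchings.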
- func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
- s.mu.Lock()
- defer s.mu.Unlock()
- // TODO: avoid this range
- kvs, _, err := s.store.Range(key, end, 0, 0)
- if err != nil {
- log.Panicf("unexpected range error (%v)", err)
- }
- n, rev = s.store.DeleteRange(key, end)
- for _, kv := range kvs {
- s.handle(rev, storagepb.Event{
- Type: storagepb.DELETE,
- Kv: &storagepb.KeyValue{
- Key: kv.Key,
- },
- })
- }
- return n, rev
- }
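- // TxnBegin starts a transaction on the underlying store. It takes s.mu and
- // holds it until the matching TxnEnd so that the events for the transaction
- // are published together.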
- func (s *watchableStore) TxnBegin() int64 {
- s.mu.Lock()
- s.tx = newOngoingTx()
- return s.store.TxnBegin()
- }
- func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
- rev, err = s.store.TxnPut(txnID, key, value)
- if err == nil {
- s.tx.put(string(key))
- }
- return rev, err
- }
- func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
- kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
- if err != nil {
- log.Panicf("unexpected range error (%v)", err)
- }
- n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
- if err == nil {
- for _, kv := range kvs {
- s.tx.del(string(kv.Key))
- }
- }
- return n, rev, err
- }
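- // TxnEnd ends the transaction started by TxnBegin, sends PUT and DELETE
- // events for the keys recorded in the ongoing txn, and releases the mutex
- // taken in TxnBegin. If the underlying store rejects txnID, the error is
- // returned, the transaction remains open and the mutex stays held.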
- func (s *watchableStore) TxnEnd(txnID int64) error {
- err := s.store.TxnEnd(txnID)
- if err != nil {
- return err
- }
- _, rev, _ := s.store.Range(nil, nil, 0, 0)
- for k := range s.tx.putm {
- kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
- if err != nil {
- log.Panicf("unexpected range error (%v)", err)
- }
- s.handle(rev, storagepb.Event{
- Type: storagepb.PUT,
- Kv: &kvs[0],
- })
- }
- for k := range s.tx.delm {
- s.handle(rev, storagepb.Event{
- Type: storagepb.DELETE,
- Kv: &storagepb.KeyValue{
- Key: []byte(k),
- },
- })
- }
- s.mu.Unlock()
- return nil
- }
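- // A minimal sketch of the transactional path above, using only methods
- // defined in this file; the key and value literals are illustrative only.
- func exampleWatchableStoreTxn(s *watchableStore) {
- id := s.TxnBegin()
- // events for keys touched inside the txn are deferred until TxnEnd
- if _, err := s.TxnPut(id, []byte("a"), []byte("1")); err != nil {
- log.Panicf("unexpected txn put error (%v)", err)
- }
- if err := s.TxnEnd(id); err != nil {
- log.Panicf("unexpected txn end error (%v)", err)
- }
- }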
- func (s *watchableStore) Close() error {
- close(s.stopc)
- s.wg.Wait()
- return s.store.Close()
- }
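- // NewWatcher returns a Watcher that receives its events on a dedicated
- // buffered channel of length chanBufLen.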
- func (s *watchableStore) NewWatcher() Watcher {
- watcherGauge.Inc()
- return &watcher{
- watchable: s,
- ch: make(chan storagepb.Event, chanBufLen),
- }
- }
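- // watch registers a watching on key (or on the key prefix when prefix is
- // true). If startRev is 0 the watching is added to the synced map and only
- // receives future events; otherwise it is added to the unsynced map and
- // caught up by syncWatchings. The returned CancelFunc removes the watching
- // from whichever map currently holds it.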
- func (s *watchableStore) watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc) {
- s.mu.Lock()
- defer s.mu.Unlock()
- wa := &watching{
- key: key,
- prefix: prefix,
- cur: startRev,
- id: id,
- ch: ch,
- }
- k := string(key)
- if startRev == 0 {
- if err := unsafeAddWatching(&s.synced, k, wa); err != nil {
- log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
- }
- } else {
- slowWatchingGauge.Inc()
- s.unsynced[wa] = struct{}{}
- }
- watchingGauge.Inc()
- cancel := CancelFunc(func() {
- s.mu.Lock()
- defer s.mu.Unlock()
- // remove the global references to the watching
- if _, ok := s.unsynced[wa]; ok {
- delete(s.unsynced, wa)
- slowWatchingGauge.Dec()
- watchingGauge.Dec()
- return
- }
- if v, ok := s.synced[k]; ok {
- if _, ok := v[wa]; ok {
- delete(v, wa)
- // if there is nothing left in s.synced[k],
- // remove the key from the synced map
- if len(v) == 0 {
- delete(s.synced, k)
- }
- watchingGauge.Dec()
- }
- }
- // If we cannot find the watching in either map, its watch has already finished.
- })
- return wa, cancel
- }
- // syncWatchingsLoop syncs the watchings in the unsynced map every 100ms.
- func (s *watchableStore) syncWatchingsLoop() {
- defer s.wg.Done()
- for {
- s.mu.Lock()
- s.syncWatchings()
- s.mu.Unlock()
- select {
- case <-time.After(100 * time.Millisecond):
- case <-s.stopc:
- return
- }
- }
- }
- // syncWatchings syncs the unsynced watchings: it iterates all unsynced
- // watchings to find the minimum revision among them, skipping any watching
- // whose current revision is behind the compact revision of the store. It
- // then ranges the backend starting from that minimum revision and sends the
- // resulting events to the matching watchings.
- func (s *watchableStore) syncWatchings() {
- s.store.mu.Lock()
- defer s.store.mu.Unlock()
- if len(s.unsynced) == 0 {
- return
- }
- // in order to find the key-value pairs the unsynced watchings are missing,
- // find the minimum revision among them; that revision is the lower bound
- // used to query the backend store for key-value pairs
- minRev := int64(math.MaxInt64)
- curRev := s.store.currentRev.main
- compactionRev := s.store.compactMainRev
- // TODO: change the type of the unsynced map to match this one
- keyToUnsynced := make(map[string]map[*watching]struct{})
- for w := range s.unsynced {
- k := string(w.key)
- if w.cur > curRev {
- panic("watching current revision should not exceed current revision")
- }
- if w.cur < compactionRev {
- // TODO: return a compacted error to that watching instead of
- // silently removing it from unsynced.
- delete(s.unsynced, w)
- continue
- }
- if minRev >= w.cur {
- minRev = w.cur
- }
- if _, ok := keyToUnsynced[k]; !ok {
- keyToUnsynced[k] = make(map[*watching]struct{})
- }
- keyToUnsynced[k][w] = struct{}{}
- }
- minBytes, maxBytes := newRevBytes(), newRevBytes()
- revToBytes(revision{main: minRev}, minBytes)
- revToBytes(revision{main: curRev + 1}, maxBytes)
- // UnsafeRange returns keys and values. In boltdb, the keys are revisions
- // and the values are the actual key-value pairs stored in the backend.
- tx := s.store.b.BatchTx()
- tx.Lock()
- ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
- tx.Unlock()
- for i, v := range vs {
- var kv storagepb.KeyValue
- if err := kv.Unmarshal(v); err != nil {
- log.Panicf("storage: cannot unmarshal event: %v", err)
- }
- k := string(kv.Key)
- wm, ok := keyToUnsynced[k]
- if !ok {
- continue
- }
- var ev storagepb.Event
- switch {
- case isTombstone(ks[i]):
- ev.Type = storagepb.DELETE
- default:
- ev.Type = storagepb.PUT
- }
- ev.Kv = &kv
- for w := range wm {
- ev.WatchID = w.id
- select {
- case w.ch <- ev:
- pendingEventsGauge.Inc()
- default:
- // TODO: handle watchings whose channels are full.
- // For now, continue to process the other watchings; the full ones will be
- // retried on the next sync and will hopefully have drained by then.
- continue
- }
- if err := unsafeAddWatching(&s.synced, k, w); err != nil {
- log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
- }
- delete(s.unsynced, w)
- }
- }
- slowWatchingGauge.Set(float64(len(s.unsynced)))
- }
- // handle dispatches the event that happened at the given rev to all interested watchings.
- func (s *watchableStore) handle(rev int64, ev storagepb.Event) {
- s.notify(rev, ev)
- }
- // notify sends the event that happened at the given rev to every synced
- // watching registered on the event's key, or on one of its prefixes for
- // prefix watchings.
- func (s *watchableStore) notify(rev int64, ev storagepb.Event) {
- // check all prefixes of the key to notify all corresponding watchings
- for i := 0; i <= len(ev.Kv.Key); i++ {
- k := string(ev.Kv.Key[:i])
- if wm, ok := s.synced[k]; ok {
- for w := range wm {
- // the watching needs to be notified when it either watches a prefix of the
- // key or the key is an exact match.
- if !w.prefix && i != len(ev.Kv.Key) {
- continue
- }
- ev.WatchID = w.id
- select {
- case w.ch <- ev:
- pendingEventsGauge.Inc()
- default:
- w.cur = rev
- s.unsynced[w] = struct{}{}
- delete(wm, w)
- slowWatchingGauge.Inc()
- }
- }
- }
- }
- }
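- // ongoingTx tracks the keys put and deleted within an ongoing transaction so
- // that TxnEnd can send the corresponding events once the txn is committed.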
- type ongoingTx struct {
- // keys put/deleted in the ongoing txn
- putm map[string]struct{}
- delm map[string]struct{}
- }
- func newOngoingTx() *ongoingTx {
- return &ongoingTx{
- putm: make(map[string]struct{}),
- delm: make(map[string]struct{}),
- }
- }
- func (tx *ongoingTx) put(k string) {
- tx.putm[k] = struct{}{}
- if _, ok := tx.delm[k]; ok {
- delete(tx.delm, k)
- }
- }
- func (tx *ongoingTx) del(k string) {
- tx.delm[k] = struct{}{}
- if _, ok := tx.putm[k]; ok {
- delete(tx.putm, k)
- }
- }
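- // watching is a single registration on a key or key prefix whose events are
- // delivered on ch.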
- type watching struct {
- // the watching key
- key []byte
- // prefix indicates if watching is on a key or a prefix.
- // If prefix is true, the watching is on a prefix.
- prefix bool
- // cur is the current watching revision.
- // If cur is behind the current revision of the KV,
- // watching is unsynced and needs to catch up.
- cur int64
- id int64
- // a chan to send out the watched events.
- // The chan might be shared with other watchings.
- ch chan<- storagepb.Event
- }
- // unsafeAddWatching puts the watching wa for key k into the synced map.
- // It is not thread-safe; the caller must hold the mutex around the call.
- func unsafeAddWatching(synced *map[string]map[*watching]struct{}, k string, wa *watching) error {
- if wa == nil {
- return fmt.Errorf("nil watching received")
- }
- mp := *synced
- if v, ok := mp[k]; ok {
- if _, ok := v[wa]; ok {
- return fmt.Errorf("put the same watch twice: %+v", wa)
- }
- v[wa] = struct{}{}
- return nil
- }
- mp[k] = make(map[*watching]struct{})
- mp[k][wa] = struct{}{}
- return nil
- }
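- // The sketch below shows the intended top-level usage of this file, using
- // only identifiers defined above; the backend path and key-value literals
- // are illustrative placeholders.
- func exampleWatchableStoreUsage() {
- s := newWatchableStore("example.db") // hypothetical backend path
- defer s.Close()
- // each watcher receives events on its own buffered channel of length chanBufLen
- w := s.NewWatcher()
- _ = w
- // Put stores the pair and notifies every synced watching registered on
- // "foo" or on one of its prefixes
- s.Put([]byte("foo"), []byte("bar"))
- }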