control.go 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. package gocql
  2. import (
  3. "errors"
  4. "fmt"
  5. "log"
  6. "math/rand"
  7. "net"
  8. "strconv"
  9. "sync"
  10. "sync/atomic"
  11. "time"
  12. )
  13. // Ensure that the atomic variable is aligned to a 64bit boundary
  14. // so that atomic operations can be applied on 32bit architectures.
  15. type controlConn struct {
  16. connecting uint64
  17. session *Session
  18. conn atomic.Value
  19. session *Session
  20. conn atomic.Value
  21. retry RetryPolicy
  22. closeWg sync.WaitGroup
  23. quit chan struct{}
  24. }
  25. func createControlConn(session *Session) *controlConn {
  26. control := &controlConn{
  27. session: session,
  28. quit: make(chan struct{}),
  29. retry: &SimpleRetryPolicy{NumRetries: 3},
  30. }
  31. control.conn.Store((*Conn)(nil))
  32. return control
  33. }
  34. func (c *controlConn) heartBeat() {
  35. defer c.closeWg.Done()
  36. for {
  37. select {
  38. case <-c.quit:
  39. return
  40. case <-time.After(5 * time.Second):
  41. }
  42. resp, err := c.writeFrame(&writeOptionsFrame{})
  43. if err != nil {
  44. goto reconn
  45. }
  46. switch resp.(type) {
  47. case *supportedFrame:
  48. continue
  49. case error:
  50. goto reconn
  51. default:
  52. panic(fmt.Sprintf("gocql: unknown frame in response to options: %T", resp))
  53. }
  54. reconn:
  55. c.reconnect(true)
  56. // time.Sleep(5 * time.Second)
  57. continue
  58. }
  59. }
  60. func (c *controlConn) connect(endpoints []string) error {
  61. // intial connection attmept, try to connect to each endpoint to get an initial
  62. // list of nodes.
  63. // shuffle endpoints so not all drivers will connect to the same initial
  64. // node.
  65. r := rand.New(rand.NewSource(time.Now().UnixNano()))
  66. perm := r.Perm(len(endpoints))
  67. shuffled := make([]string, len(endpoints))
  68. for i, endpoint := range endpoints {
  69. shuffled[perm[i]] = endpoint
  70. }
  71. // store that we are not connected so that reconnect wont happen if we error
  72. atomic.StoreInt64(&c.connecting, -1)
  73. var (
  74. conn *Conn
  75. err error
  76. )
  77. for _, addr := range shuffled {
  78. conn, err = c.session.connect(JoinHostPort(addr, c.session.cfg.Port), c)
  79. if err != nil {
  80. log.Printf("gocql: unable to control conn dial %v: %v\n", addr, err)
  81. continue
  82. }
  83. if err = c.registerEvents(conn); err != nil {
  84. conn.Close()
  85. continue
  86. }
  87. // we should fetch the initial ring here and update initial host data. So that
  88. // when we return from here we have a ring topology ready to go.
  89. break
  90. }
  91. if conn == nil {
  92. // this is fatal, not going to connect a session
  93. return err
  94. }
  95. c.conn.Store(conn)
  96. atomic.StoreInt64(&c.connecting, 0)
  97. c.closeWg.Add(1)
  98. go c.heartBeat()
  99. return nil
  100. }
  101. func (c *controlConn) registerEvents(conn *Conn) error {
  102. framer, err := conn.exec(&writeRegisterFrame{
  103. events: []string{"TOPOLOGY_CHANGE", "STATUS_CHANGE", "STATUS_CHANGE"},
  104. }, nil)
  105. if err != nil {
  106. return err
  107. }
  108. frame, err := framer.parseFrame()
  109. if err != nil {
  110. return err
  111. } else if _, ok := frame.(*readyFrame); !ok {
  112. return fmt.Errorf("unexpected frame in response to register: got %T: %v\n", frame, frame)
  113. }
  114. return nil
  115. }
  116. func (c *controlConn) reconnect(refreshring bool) {
  117. // TODO: simplify this function, use session.ring to get hosts instead of the
  118. // connection pool
  119. if !atomic.CompareAndSwapInt64(&c.connecting, 0, 1) {
  120. return
  121. }
  122. success := false
  123. defer func() {
  124. // debounce reconnect a little
  125. if success {
  126. go func() {
  127. time.Sleep(500 * time.Millisecond)
  128. atomic.StoreInt64(&c.connecting, 0)
  129. }()
  130. } else {
  131. atomic.StoreInt64(&c.connecting, 0)
  132. }
  133. }()
  134. addr := c.addr()
  135. oldConn := c.conn.Load().(*Conn)
  136. if oldConn != nil {
  137. oldConn.Close()
  138. }
  139. var newConn *Conn
  140. if addr != "" {
  141. // try to connect to the old host
  142. conn, err := c.session.connect(addr, c)
  143. if err != nil {
  144. // host is dead
  145. // TODO: this is replicated in a few places
  146. ip, portStr, _ := net.SplitHostPort(addr)
  147. port, _ := strconv.Atoi(portStr)
  148. c.session.handleNodeDown(net.ParseIP(ip), port)
  149. } else {
  150. newConn = conn
  151. }
  152. }
  153. // TODO: should have our own roundrobbin for hosts so that we can try each
  154. // in succession and guantee that we get a different host each time.
  155. if newConn == nil {
  156. _, conn := c.session.pool.Pick(nil)
  157. if conn == nil {
  158. return
  159. }
  160. if conn == nil {
  161. return
  162. }
  163. var err error
  164. newConn, err = c.session.connect(conn.addr, c)
  165. if err != nil {
  166. // TODO: add log handler for things like this
  167. return
  168. }
  169. }
  170. if err := c.registerEvents(newConn); err != nil {
  171. // TODO: handle this case better
  172. newConn.Close()
  173. log.Printf("gocql: control unable to register events: %v\n", err)
  174. return
  175. }
  176. c.conn.Store(newConn)
  177. success = true
  178. if refreshring {
  179. c.session.hostSource.refreshRing()
  180. }
  181. }
  182. func (c *controlConn) HandleError(conn *Conn, err error, closed bool) {
  183. if !closed {
  184. return
  185. }
  186. oldConn := c.conn.Load().(*Conn)
  187. if oldConn != conn {
  188. return
  189. }
  190. c.reconnect(true)
  191. }
  192. func (c *controlConn) writeFrame(w frameWriter) (frame, error) {
  193. conn := c.conn.Load().(*Conn)
  194. if conn == nil {
  195. return nil, errNoControl
  196. }
  197. framer, err := conn.exec(w, nil)
  198. if err != nil {
  199. return nil, err
  200. }
  201. return framer.parseFrame()
  202. }
  203. func (c *controlConn) withConn(fn func(*Conn) *Iter) *Iter {
  204. const maxConnectAttempts = 5
  205. connectAttempts := 0
  206. for i := 0; i < maxConnectAttempts; i++ {
  207. conn := c.conn.Load().(*Conn)
  208. if conn == nil {
  209. if connectAttempts > maxConnectAttempts {
  210. break
  211. }
  212. connectAttempts++
  213. c.reconnect(false)
  214. continue
  215. }
  216. return fn(conn)
  217. }
  218. return &Iter{err: errNoControl}
  219. }
  220. // query will return nil if the connection is closed or nil
  221. func (c *controlConn) query(statement string, values ...interface{}) (iter *Iter) {
  222. q := c.session.Query(statement, values...).Consistency(One)
  223. for {
  224. iter = c.withConn(func(conn *Conn) *Iter {
  225. return conn.executeQuery(q)
  226. })
  227. q.attempts++
  228. if iter.err == nil || !c.retry.Attempt(q) {
  229. break
  230. }
  231. }
  232. return
  233. }
  234. func (c *controlConn) fetchHostInfo(addr net.IP, port int) (*HostInfo, error) {
  235. // TODO(zariel): we should probably move this into host_source or atleast
  236. // share code with it.
  237. hostname, _, err := net.SplitHostPort(c.addr())
  238. if err != nil {
  239. return nil, fmt.Errorf("unable to fetch host info, invalid conn addr: %q: %v", c.addr(), err)
  240. }
  241. isLocal := hostname == addr.String()
  242. var fn func(*HostInfo) error
  243. if isLocal {
  244. fn = func(host *HostInfo) error {
  245. // TODO(zariel): should we fetch rpc_address from here?
  246. iter := c.query("SELECT data_center, rack, host_id, tokens, release_version FROM system.local WHERE key='local'")
  247. iter.Scan(&host.dataCenter, &host.rack, &host.hostId, &host.tokens, &host.version)
  248. return iter.Close()
  249. }
  250. } else {
  251. fn = func(host *HostInfo) error {
  252. // TODO(zariel): should we fetch rpc_address from here?
  253. iter := c.query("SELECT data_center, rack, host_id, tokens, release_version FROM system.peers WHERE peer=?", addr)
  254. iter.Scan(&host.dataCenter, &host.rack, &host.hostId, &host.tokens, &host.version)
  255. return iter.Close()
  256. }
  257. }
  258. host := &HostInfo{
  259. port: port,
  260. }
  261. if err := fn(host); err != nil {
  262. return nil, err
  263. }
  264. host.peer = addr.String()
  265. return host, nil
  266. }
  267. func (c *controlConn) awaitSchemaAgreement() error {
  268. return c.withConn(func(conn *Conn) *Iter {
  269. return &Iter{err: conn.awaitSchemaAgreement()}
  270. }).err
  271. }
  272. func (c *controlConn) addr() string {
  273. conn := c.conn.Load().(*Conn)
  274. if conn == nil {
  275. return ""
  276. }
  277. return conn.addr
  278. }
  279. func (c *controlConn) close() {
  280. // TODO: handle more gracefully
  281. close(c.quit)
  282. c.closeWg.Wait()
  283. conn := c.conn.Load().(*Conn)
  284. if conn != nil {
  285. conn.Close()
  286. }
  287. }
  288. var errNoControl = errors.New("gocql: no control connection available")