control.go

package gocql

import (
	"errors"
	"fmt"
	"log"
	"math/rand"
	"net"
	"strconv"
	"sync"
	"sync/atomic"
	"time"
)
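
// controlConn maintains a single connection to the cluster that is used for
// metadata queries, schema agreement and server event registration.
//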
// Ensure that the atomic variable is aligned to a 64bit boundary
// so that atomic operations can be applied on 32bit architectures.
type controlConn struct {
	connecting int64

	session *Session
	conn    atomic.Value

	retry RetryPolicy

	closeWg sync.WaitGroup
	quit    chan struct{}
}
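
// createControlConn builds an unconnected controlConn for the session;
// connect must be called before the control connection is usable.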
func createControlConn(session *Session) *controlConn {
	control := &controlConn{
		session: session,
		quit:    make(chan struct{}),
		retry:   &SimpleRetryPolicy{NumRetries: 3},
	}

	control.conn.Store((*Conn)(nil))

	return control
}
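
// heartBeat periodically sends an OPTIONS frame on the control connection and
// triggers a reconnect (including a ring refresh) when the write fails or the
// server responds with an error.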
func (c *controlConn) heartBeat() {
	defer c.closeWg.Done()

	for {
		select {
		case <-c.quit:
			return
		case <-time.After(5 * time.Second):
		}

		resp, err := c.writeFrame(&writeOptionsFrame{})
		if err != nil {
			goto reconn
		}

		switch resp.(type) {
		case *supportedFrame:
			// the connection is healthy, wait for the next tick
			continue
		case error:
			goto reconn
		default:
			panic(fmt.Sprintf("gocql: unknown frame in response to options: %T", resp))
		}

	reconn:
		c.reconnect(true)
		// time.Sleep(5 * time.Second)
		continue
	}
}
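
// connect dials the initial control connection, trying the supplied endpoints
// in random order, registers for server events and starts the heartbeat
// goroutine. It returns the last error seen if no endpoint could be used.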
func (c *controlConn) connect(endpoints []string) error {
	// initial connection attempt: try each endpoint to get an initial
	// list of nodes.

	// shuffle endpoints so not all drivers will connect to the same initial
	// node.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	perm := r.Perm(len(endpoints))
	shuffled := make([]string, len(endpoints))

	for i, endpoint := range endpoints {
		shuffled[perm[i]] = endpoint
	}

	// store that we are not connected so that reconnect won't happen if we error
	atomic.StoreInt64(&c.connecting, -1)

	var (
		conn *Conn
		err  error
	)

	for _, addr := range shuffled {
		conn, err = c.session.connect(JoinHostPort(addr, c.session.cfg.Port), c)
		if err != nil {
			log.Printf("gocql: unable to dial control conn %v: %v\n", addr, err)
			continue
		}

		if err = c.registerEvents(conn); err != nil {
			conn.Close()
			continue
		}

		// we should fetch the initial ring here and update initial host data, so that
		// when we return from here we have a ring topology ready to go.
		break
	}

	if conn == nil {
		// this is fatal, we are not going to be able to connect the session
		return err
	}

	c.conn.Store(conn)
	atomic.StoreInt64(&c.connecting, 0)

	c.closeWg.Add(1)
	go c.heartBeat()

	return nil
}
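
// registerEvents sends a REGISTER frame for the event types enabled in the
// cluster config (topology, status and schema changes) and expects a READY
// frame in response.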
func (c *controlConn) registerEvents(conn *Conn) error {
	var events []string

	if !c.session.cfg.Events.DisableTopologyEvents {
		events = append(events, "TOPOLOGY_CHANGE")
	}
	if !c.session.cfg.Events.DisableNodeStatusEvents {
		events = append(events, "STATUS_CHANGE")
	}
	if !c.session.cfg.Events.DisableSchemaEvents {
		events = append(events, "SCHEMA_CHANGE")
	}

	if len(events) == 0 {
		return nil
	}

	framer, err := conn.exec(&writeRegisterFrame{
		events: events,
	}, nil)
	if err != nil {
		return err
	}

	frame, err := framer.parseFrame()
	if err != nil {
		return err
	} else if _, ok := frame.(*readyFrame); !ok {
		return fmt.Errorf("unexpected frame in response to register: got %T: %v", frame, frame)
	}

	return nil
}
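
// reconnect replaces the current control connection, preferring the previous
// host and falling back to a host picked from the connection pool. At most one
// reconnect runs at a time, guarded by the connecting flag; when refreshring is
// true and the reconnect succeeds, the ring topology is refreshed.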
func (c *controlConn) reconnect(refreshring bool) {
	// TODO: simplify this function, use session.ring to get hosts instead of the
	// connection pool
	if !atomic.CompareAndSwapInt64(&c.connecting, 0, 1) {
		return
	}

	success := false
	defer func() {
		// debounce reconnect a little
		if success {
			go func() {
				time.Sleep(500 * time.Millisecond)
				atomic.StoreInt64(&c.connecting, 0)
			}()
		} else {
			atomic.StoreInt64(&c.connecting, 0)
		}
	}()

	addr := c.addr()
	oldConn := c.conn.Load().(*Conn)
	if oldConn != nil {
		oldConn.Close()
	}

	var newConn *Conn
	if addr != "" {
		// try to connect to the old host
		conn, err := c.session.connect(addr, c)
		if err != nil {
			// host is dead
			// TODO: this is replicated in a few places
			ip, portStr, _ := net.SplitHostPort(addr)
			port, _ := strconv.Atoi(portStr)
			c.session.handleNodeDown(net.ParseIP(ip), port)
		} else {
			newConn = conn
		}
	}

	// TODO: should have our own round-robin for hosts so that we can try each
	// in succession and guarantee that we get a different host each time.
	if newConn == nil {
		_, conn := c.session.pool.Pick(nil)
		if conn == nil {
			return
		}

		var err error
		newConn, err = c.session.connect(conn.addr, c)
		if err != nil {
			// TODO: add log handler for things like this
			return
		}
	}

	if err := c.registerEvents(newConn); err != nil {
		// TODO: handle this case better
		newConn.Close()
		log.Printf("gocql: control unable to register events: %v\n", err)
		return
	}

	c.conn.Store(newConn)
	success = true

	if refreshring {
		c.session.hostSource.refreshRing()
	}
}
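
// HandleError is invoked when the underlying connection reports an error; if
// the current control connection has been closed, a reconnect (with ring
// refresh) is triggered.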
func (c *controlConn) HandleError(conn *Conn, err error, closed bool) {
	if !closed {
		return
	}

	oldConn := c.conn.Load().(*Conn)
	if oldConn != conn {
		return
	}

	c.reconnect(true)
}
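
// writeFrame sends a single frame on the control connection and returns the
// parsed response, or errNoControl when no control connection is available.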
func (c *controlConn) writeFrame(w frameWriter) (frame, error) {
	conn := c.conn.Load().(*Conn)
	if conn == nil {
		return nil, errNoControl
	}

	framer, err := conn.exec(w, nil)
	if err != nil {
		return nil, err
	}

	return framer.parseFrame()
}
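
// withConn runs fn against the current control connection, attempting to
// reconnect a bounded number of times if none is available; if no connection
// can be obtained, an Iter carrying errNoControl is returned.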
func (c *controlConn) withConn(fn func(*Conn) *Iter) *Iter {
	const maxConnectAttempts = 5
	connectAttempts := 0

	for i := 0; i < maxConnectAttempts; i++ {
		conn := c.conn.Load().(*Conn)
		if conn == nil {
			if connectAttempts > maxConnectAttempts {
				break
			}

			connectAttempts++

			c.reconnect(false)
			continue
		}

		return fn(conn)
	}

	return &Iter{err: errNoControl}
}
// query executes the statement on the control connection, retrying according
// to the control connection's retry policy. The returned Iter carries any
// error, including errNoControl when no control connection is available.
func (c *controlConn) query(statement string, values ...interface{}) (iter *Iter) {
	q := c.session.Query(statement, values...).Consistency(One)

	for {
		iter = c.withConn(func(conn *Conn) *Iter {
			return conn.executeQuery(q)
		})

		q.attempts++
		if iter.err == nil || !c.retry.Attempt(q) {
			break
		}
	}

	return
}
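
// fetchHostInfo looks up host metadata (data center, rack, host ID, tokens and
// release version) for the given address, querying system.local when the
// address is the control connection's own host and system.peers otherwise.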
func (c *controlConn) fetchHostInfo(addr net.IP, port int) (*HostInfo, error) {
	// TODO(zariel): we should probably move this into host_source or at least
	// share code with it.
	hostname, _, err := net.SplitHostPort(c.addr())
	if err != nil {
		return nil, fmt.Errorf("unable to fetch host info, invalid conn addr: %q: %v", c.addr(), err)
	}

	isLocal := hostname == addr.String()

	var fn func(*HostInfo) error

	if isLocal {
		fn = func(host *HostInfo) error {
			// TODO(zariel): should we fetch rpc_address from here?
			iter := c.query("SELECT data_center, rack, host_id, tokens, release_version FROM system.local WHERE key='local'")
			iter.Scan(&host.dataCenter, &host.rack, &host.hostId, &host.tokens, &host.version)
			return iter.Close()
		}
	} else {
		fn = func(host *HostInfo) error {
			// TODO(zariel): should we fetch rpc_address from here?
			iter := c.query("SELECT data_center, rack, host_id, tokens, release_version FROM system.peers WHERE peer=?", addr)
			iter.Scan(&host.dataCenter, &host.rack, &host.hostId, &host.tokens, &host.version)
			return iter.Close()
		}
	}

	host := &HostInfo{
		port: port,
	}

	if err := fn(host); err != nil {
		return nil, err
	}

	host.peer = addr.String()

	return host, nil
}
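
// awaitSchemaAgreement delegates to the underlying connection's
// schema-agreement wait and returns its error, or errNoControl if no control
// connection is available.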
func (c *controlConn) awaitSchemaAgreement() error {
	return c.withConn(func(conn *Conn) *Iter {
		return &Iter{err: conn.awaitSchemaAgreement()}
	}).err
}
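
// addr returns the address of the current control connection, or an empty
// string if there is none.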
func (c *controlConn) addr() string {
	conn := c.conn.Load().(*Conn)
	if conn == nil {
		return ""
	}
	return conn.addr
}
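
// close stops the heartbeat goroutine, waits for it to exit, and closes the
// underlying connection if one exists.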
func (c *controlConn) close() {
	// TODO: handle more gracefully
	close(c.quit)
	c.closeWg.Wait()

	conn := c.conn.Load().(*Conn)
	if conn != nil {
		conn.Close()
	}
}

var errNoControl = errors.New("gocql: no control connection available")