Browse Source

control: handle no hosts in connection pool

When the control connection attempts to reconnect and there are no hosts
in the connection pool, fall back to trying the initial endpoints,
shuffled again.

Set the host to up after the control connection connects to it.

Simplify the control connection reconnecting logic; it is thread-safe
because the only callers of the connect methods are the heartbeater and
the initial connection attempt.
Chris Bannister 10 years ago
parent
commit
81591930ed
3 changed files with 130 additions and 65 deletions
  1. 73 65
      control.go
  2. 53 0
      events_ccm_test.go
  3. 4 0
      session.go

+ 73 - 65
control.go

@@ -1,28 +1,39 @@
 package gocql
 package gocql
 
 
 import (
 import (
+	crand "crypto/rand"
 	"errors"
 	"errors"
 	"fmt"
 	"fmt"
 	"log"
 	"log"
 	"math/rand"
 	"math/rand"
 	"net"
 	"net"
 	"strconv"
 	"strconv"
-	"sync"
 	"sync/atomic"
 	"sync/atomic"
 	"time"
 	"time"
 )
 )
 
 
+var (
+	randr *rand.Rand
+)
+
+func init() {
+	b := make([]byte, 4)
+	if _, err := crand.Read(b); err != nil {
+		panic(fmt.Sprintf("unable to seed random number generator: %v", err))
+	}
+
+	randr = rand.New(rand.NewSource(int64(readInt(b))))
+}
+
 // Ensure that the atomic variable is aligned to a 64bit boundary
 // Ensure that the atomic variable is aligned to a 64bit boundary
 // so that atomic operations can be applied on 32bit architectures.
 // so that atomic operations can be applied on 32bit architectures.
 type controlConn struct {
 type controlConn struct {
-	connecting int64
-
 	session *Session
 	session *Session
 	conn    atomic.Value
 	conn    atomic.Value
 
 
 	retry RetryPolicy
 	retry RetryPolicy
 
 
-	closeWg sync.WaitGroup
+	started int32
 	quit    chan struct{}
 	quit    chan struct{}
 }
 }
 
 
@@ -39,13 +50,17 @@ func createControlConn(session *Session) *controlConn {
 }
 }
 
 
 func (c *controlConn) heartBeat() {
 func (c *controlConn) heartBeat() {
-	defer c.closeWg.Done()
+	if !atomic.CompareAndSwapInt32(&c.started, 0, 1) {
+		return
+	}
+
+	sleepTime := 1 * time.Second
 
 
 	for {
 	for {
 		select {
 		select {
 		case <-c.quit:
 		case <-c.quit:
 			return
 			return
-		case <-time.After(5 * time.Second):
+		case <-time.After(sleepTime):
 		}
 		}
 
 
 		resp, err := c.writeFrame(&writeOptionsFrame{})
 		resp, err := c.writeFrame(&writeOptionsFrame{})
@@ -55,6 +70,8 @@ func (c *controlConn) heartBeat() {
 
 
 		switch resp.(type) {
 		switch resp.(type) {
 		case *supportedFrame:
 		case *supportedFrame:
+			// Everything ok
+			sleepTime = 5 * time.Second
 			continue
 			continue
 		case error:
 		case error:
 			goto reconn
 			goto reconn
@@ -63,65 +80,79 @@ func (c *controlConn) heartBeat() {
 		}
 		}
 
 
 	reconn:
 	reconn:
+		// try to connect a bit faster
+		sleepTime = 1 * time.Second
 		c.reconnect(true)
 		c.reconnect(true)
 		// time.Sleep(5 * time.Second)
 		// time.Sleep(5 * time.Second)
 		continue
 		continue
 	}
 	}
 }
 }
 
 
-func (c *controlConn) connect(endpoints []string) error {
-	// intial connection attmept, try to connect to each endpoint to get an initial
-	// list of nodes.
-
-	// shuffle endpoints so not all drivers will connect to the same initial
-	// node.
-	r := rand.New(rand.NewSource(time.Now().UnixNano()))
-	perm := r.Perm(len(endpoints))
+func (c *controlConn) shuffleDial(endpoints []string) (conn *Conn, err error) {
+	perm := randr.Perm(len(endpoints))
 	shuffled := make([]string, len(endpoints))
 	shuffled := make([]string, len(endpoints))
 
 
 	for i, endpoint := range endpoints {
 	for i, endpoint := range endpoints {
 		shuffled[perm[i]] = endpoint
 		shuffled[perm[i]] = endpoint
 	}
 	}
 
 
-	// store that we are not connected so that reconnect wont happen if we error
-	atomic.StoreInt64(&c.connecting, -1)
-
-	var (
-		conn *Conn
-		err  error
-	)
-
+	// shuffle endpoints so not all drivers will connect to the same initial
+	// node.
 	for _, addr := range shuffled {
 	for _, addr := range shuffled {
 		conn, err = c.session.connect(JoinHostPort(addr, c.session.cfg.Port), c)
 		conn, err = c.session.connect(JoinHostPort(addr, c.session.cfg.Port), c)
-		if err != nil {
-			log.Printf("gocql: unable to control conn dial %v: %v\n", addr, err)
-			continue
+		if err == nil {
+			return
 		}
 		}
 
 
-		if err = c.registerEvents(conn); err != nil {
-			conn.Close()
-			continue
-		}
+		log.Printf("gocql: unable to control conn dial %v: %v\n", addr, err)
+	}
 
 
-		// we should fetch the initial ring here and update initial host data. So that
-		// when we return from here we have a ring topology ready to go.
-		break
+	return
+}
+
+func (c *controlConn) connect(endpoints []string) error {
+	conn, err := c.shuffleDial(endpoints)
+	if err != nil {
+		return err
+	} else if conn == nil {
+		return errors.New("gocql: unable to connect to initial endpoints")
 	}
 	}
 
 
-	if conn == nil {
-		// this is fatal, not going to connect a session
+	if err := c.setupConn(conn); err != nil {
+		conn.Close()
 		return err
 		return err
 	}
 	}
 
 
-	c.conn.Store(conn)
-	atomic.StoreInt64(&c.connecting, 0)
+	// we could fetch the initial ring here and update initial host data. So that
+	// when we return from here we have a ring topology ready to go.
 
 
-	c.closeWg.Add(1)
 	go c.heartBeat()
 	go c.heartBeat()
 
 
 	return nil
 	return nil
 }
 }
 
 
+func (c *controlConn) setupConn(conn *Conn) error {
+	if err := c.registerEvents(conn); err != nil {
+		conn.Close()
+		return err
+	}
+
+	c.conn.Store(conn)
+
+	host, portstr, err := net.SplitHostPort(conn.conn.RemoteAddr().String())
+	if err != nil {
+		return err
+	}
+	port, err := strconv.Atoi(portstr)
+	if err != nil {
+		return err
+	}
+
+	go c.session.handleNodeUp(net.ParseIP(host), port, true)
+
+	return nil
+}
+
 func (c *controlConn) registerEvents(conn *Conn) error {
 func (c *controlConn) registerEvents(conn *Conn) error {
 	var events []string
 	var events []string
 
 
@@ -159,22 +190,6 @@ func (c *controlConn) registerEvents(conn *Conn) error {
 func (c *controlConn) reconnect(refreshring bool) {
 func (c *controlConn) reconnect(refreshring bool) {
 	// TODO: simplify this function, use session.ring to get hosts instead of the
 	// TODO: simplify this function, use session.ring to get hosts instead of the
 	// connection pool
 	// connection pool
-	if !atomic.CompareAndSwapInt64(&c.connecting, 0, 1) {
-		return
-	}
-
-	success := false
-	defer func() {
-		// debounce reconnect a little
-		if success {
-			go func() {
-				time.Sleep(500 * time.Millisecond)
-				atomic.StoreInt64(&c.connecting, 0)
-			}()
-		} else {
-			atomic.StoreInt64(&c.connecting, 0)
-		}
-	}()
 
 
 	addr := c.addr()
 	addr := c.addr()
 	oldConn := c.conn.Load().(*Conn)
 	oldConn := c.conn.Load().(*Conn)
@@ -202,10 +217,7 @@ func (c *controlConn) reconnect(refreshring bool) {
 	if newConn == nil {
 	if newConn == nil {
 		_, conn := c.session.pool.Pick(nil)
 		_, conn := c.session.pool.Pick(nil)
 		if conn == nil {
 		if conn == nil {
-			return
-		}
-
-		if conn == nil {
+			c.connect(c.session.ring.endpoints)
 			return
 			return
 		}
 		}
 
 
@@ -217,16 +229,12 @@ func (c *controlConn) reconnect(refreshring bool) {
 		}
 		}
 	}
 	}
 
 
-	if err := c.registerEvents(newConn); err != nil {
-		// TODO: handle this case better
+	if err := c.setupConn(newConn); err != nil {
 		newConn.Close()
 		newConn.Close()
 		log.Printf("gocql: control unable to register events: %v\n", err)
 		log.Printf("gocql: control unable to register events: %v\n", err)
 		return
 		return
 	}
 	}
 
 
-	c.conn.Store(newConn)
-	success = true
-
 	if refreshring {
 	if refreshring {
 		c.session.hostSource.refreshRing()
 		c.session.hostSource.refreshRing()
 	}
 	}
@@ -355,9 +363,9 @@ func (c *controlConn) addr() string {
 }
 }
 
 
 func (c *controlConn) close() {
 func (c *controlConn) close() {
-	// TODO: handle more gracefully
-	close(c.quit)
-	c.closeWg.Wait()
+	if atomic.CompareAndSwapInt32(&c.started, 1, -1) {
+		c.quit <- struct{}{}
+	}
 	conn := c.conn.Load().(*Conn)
 	conn := c.conn.Load().(*Conn)
 	if conn != nil {
 	if conn != nil {
 		conn.Close()
 		conn.Close()

+ 53 - 0
events_ccm_test.go

@@ -236,3 +236,56 @@ func TestEventFilter(t *testing.T) {
 	}
 	}
 
 
 }
 }
+
+func TestEventDownQueryable(t *testing.T) {
+	if err := ccm.AllUp(); err != nil {
+		t.Fatal(err)
+	}
+
+	status, err := ccm.Status()
+	if err != nil {
+		t.Fatal(err)
+	}
+	log.Printf("status=%+v\n", status)
+
+	const targetNode = "node1"
+
+	addr := status[targetNode].Addr
+
+	cluster := createCluster()
+	cluster.Hosts = []string{addr}
+	cluster.HostFilter = WhiteListHostFilter(addr)
+	session := createSessionFromCluster(cluster, t)
+	defer session.Close()
+
+	if pool, ok := session.pool.getPool(addr); !ok {
+		t.Fatalf("should have %v in pool but dont", addr)
+	} else if !pool.host.IsUp() {
+		t.Fatalf("host is not up %v", pool.host)
+	}
+
+	if err := ccm.NodeDown(targetNode); err != nil {
+		t.Fatal(err)
+	}
+
+	time.Sleep(5 * time.Second)
+
+	if err := ccm.NodeUp(targetNode); err != nil {
+		t.Fatal(err)
+	}
+
+	time.Sleep(15 * time.Second)
+
+	if pool, ok := session.pool.getPool(addr); !ok {
+		t.Fatalf("should have %v in pool but dont", addr)
+	} else if !pool.host.IsUp() {
+		t.Fatalf("host is not up %v", pool.host)
+	}
+
+	var rows int
+	if err := session.Query("SELECT COUNT(*) FROM system.local").Scan(&rows); err != nil {
+		t.Fatal(err)
+	} else if rows != 1 {
+		t.Fatalf("expected to get 1 row got %d", rows)
+	}
+}

+ 4 - 0
session.go

@@ -259,6 +259,10 @@ func (s *Session) Close() {
 	if s.nodeEvents != nil {
 	if s.nodeEvents != nil {
 		s.nodeEvents.stop()
 		s.nodeEvents.stop()
 	}
 	}
+
+	if s.schemaEvents != nil {
+		s.schemaEvents.stop()
+	}
 }
 }
 
 
 func (s *Session) Closed() bool {
 func (s *Session) Closed() bool {