
Rework how the client connects to brokers.

Fixes #9.

This ended up being more complicated than I had hoped and touched several
different areas. The TL;DR is that we now connect to the other brokers in the
cluster asynchronously; connection errors only show up when somebody actually
tries to use that broker.

This is better than the old behaviour since it means that if some brokers in a
cluster go down but the topics we care about are still available, we just keep
going instead of blowing up for no reason.

The complicated part is that simply calling `go broker.Connect()` doesn't do
what we want, so I had to write a `broker.AsyncConnect()`. The problem occurs if
you've got code like this:
    go broker.Connect()
    // do some stuff
    broker.SendSomeMessage()
What can happen is that SendSomeMessage runs before the Connect() goroutine
ever gets scheduled, in which case SendSomeMessage simply returns NotConnected.
The desired behaviour is that SendSomeMessage waits for the connect to finish,
which means AsyncConnect() has to *synchronously* take the broker lock before
it launches the asynchronous connect call. Lots of fun.
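To make the difference concrete, here is a minimal standalone sketch of the locking
pattern (the fakeBroker type, its fields, and the sleep are made up for illustration;
the real implementation is in the broker.go diff below):

    package main

    import (
        "errors"
        "fmt"
        "sync"
        "time"
    )

    var NotConnected = errors.New("not connected")

    type fakeBroker struct {
        lock      sync.Mutex
        connected bool
        connErr   error
    }

    // AsyncConnect takes the lock *synchronously*, then finishes connecting in a
    // goroutine. Anyone who grabs the lock afterwards is guaranteed to see either
    // a finished connection or the stored connection error, never a bare
    // "not connected because the goroutine hasn't run yet" state.
    func (b *fakeBroker) AsyncConnect() {
        b.lock.Lock()
        go func() {
            defer b.lock.Unlock()
            time.Sleep(50 * time.Millisecond) // stand-in for the real dial
            b.connected = true
        }()
    }

    // SendSomeMessage stands in for any operation that needs the connection.
    func (b *fakeBroker) SendSomeMessage() error {
        b.lock.Lock()
        defer b.lock.Unlock()
        if !b.connected {
            if b.connErr != nil {
                return b.connErr
            }
            return NotConnected
        }
        return nil
    }

    func main() {
        b := new(fakeBroker)
        b.AsyncConnect()
        // Blocks on the lock until the connect goroutine finishes, so this
        // reports success (nil) instead of NotConnected.
        fmt.Println(b.SendSomeMessage())
    }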

And a bonus change in this commit: rather than special-casing leader == -1 in
`client.cachedLeader` and adding a big long comment to the LEADER_NOT_AVAILABLE
case explaining the fallthrough statement, just delete that partition from the
hash. So much easier to follow; I must have been on crack when I wrote the old
way.
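
For illustration, a minimal standalone sketch of the new map handling (the topic
name and broker IDs are made up; client.leaders has the same
topic -> partition ID -> leader broker ID shape as in the diff below):

    package main

    import "fmt"

    func main() {
        // leaders maps topic -> partition ID -> leader broker ID.
        leaders := map[string]map[int32]int32{
            "my_topic": {0: 2, 1: 3},
        }

        // LEADER_NOT_AVAILABLE for partition 1: instead of storing -1 and making
        // every reader special-case it, just drop the entry. A plain lookup miss
        // then means "no cached leader, go refresh metadata".
        delete(leaders["my_topic"], 1)

        if id, ok := leaders["my_topic"][1]; ok {
            fmt.Println("cached leader:", id)
        } else {
            fmt.Println("no cached leader, refresh metadata")
        }
    }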
Evan Huus, 12 years ago
parent commit a0e96da4d1
2 changed files with 42 additions and 22 deletions:
  1. broker.go (+30 -2)
  2. client.go (+12 -20)

broker.go (+30 -2)

@@ -14,6 +14,7 @@ type Broker struct {
 
 	correlation_id int32
 	conn           net.Conn
+	conn_err       error
 	lock           sync.Mutex
 
 	responses chan responsePromise
@@ -27,7 +28,7 @@ type responsePromise struct {
 }
 
 // NewBroker creates and returns a Broker targeting the given host:port address.
-// This does not attempt to actually connect, you have to call Connect() for that.
+// This does not attempt to actually connect, you have to call Connect() or AsyncConnect() for that.
 func NewBroker(host string, port int32) *Broker {
 	b := new(Broker)
 	b.id = -1 // don't know it yet
@@ -40,17 +41,39 @@ func (b *Broker) Connect() error {
 	b.lock.Lock()
 	defer b.lock.Unlock()
 
+	return b.connect()
+}
+
+// AsyncConnect tries to connect to the Broker in a non-blocking way. Calling `broker.AsyncConnect()` is
+// *NOT* the same as calling `go broker.Connect()` - AsyncConnect takes the broker lock synchronously before
+// launching its goroutine, so that subsequent operations on the broker are guaranteed to block waiting for
+// the connection instead of simply returning NotConnected. This does mean that if someone is already operating
+// on the broker, AsyncConnect may not be truly asynchronous while it waits for the lock.
+func (b *Broker) AsyncConnect() {
+	b.lock.Lock()
+
+	go func() {
+		defer b.lock.Unlock()
+		b.connect()
+	}()
+
+}
+
+func (b *Broker) connect() error {
 	if b.conn != nil {
 		return AlreadyConnected
 	}
+	b.conn_err = nil
 
 	addr, err := net.ResolveIPAddr("ip", b.host)
 	if err != nil {
+		b.conn_err = err
 		return err
 	}
 
 	b.conn, err = net.DialTCP("tcp", nil, &net.TCPAddr{IP: addr.IP, Port: int(b.port)})
 	if err != nil {
+		b.conn_err = err
 		return err
 	}
 
@@ -78,6 +101,7 @@ func (b *Broker) Close() error {
 	err := b.conn.Close()
 
 	b.conn = nil
+	b.conn_err = nil
 	b.done = nil
 	b.responses = nil
 
@@ -184,7 +208,11 @@ func (b *Broker) send(clientID string, req requestEncoder, promiseResponse bool)
 	defer b.lock.Unlock()
 
 	if b.conn == nil {
-		return nil, NotConnected
+		if b.conn_err != nil {
+			return nil, b.conn_err
+		} else {
+			return nil, NotConnected
+		}
 	}
 
 	fullRequest := request{b.correlation_id, clientID, req}

client.go (+12 -20)

@@ -177,7 +177,7 @@ func (client *Client) cachedLeader(topic string, partition_id int32) *Broker {
 	partitions := client.leaders[topic]
 	if partitions != nil {
 		leader, ok := partitions[partition_id]
-		if ok && leader != -1 {
+		if ok {
 			return client.brokers[leader]
 		}
 	}
@@ -205,34 +205,29 @@ func (client *Client) cachedPartitions(topic string) []int32 {
 
 // if no fatal error, returns a list of topics that need retrying due to LEADER_NOT_AVAILABLE
 func (client *Client) update(data *MetadataResponse) ([]string, error) {
+	client.lock.Lock()
+	defer client.lock.Unlock()
+
 	// First discard brokers that we already know about. This avoids bouncing TCP connections,
 	// and especially avoids closing valid connections out from under other code which may be trying
-	// to use them. We only need a read-lock for this.
+	// to use them.
 	var newBrokers []*Broker
-	client.lock.RLock()
 	for _, broker := range data.Brokers {
 		if !broker.Equals(client.brokers[broker.ID()]) {
 			newBrokers = append(newBrokers, broker)
 		}
 	}
-	client.lock.RUnlock()
-
-	// connect to the brokers before taking the write lock, as this can take a while
-	// to timeout if one of them isn't reachable
-	for _, broker := range newBrokers {
-		err := broker.Connect()
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	client.lock.Lock()
-	defer client.lock.Unlock()
 
+	// Now asynchronously try to open connections to the new brokers. We don't care if they
+	// fail, since maybe that broker is unreachable but doesn't have a topic we care about.
+	// If it fails and we do care, whoever tries to use it will get the connection error.
+	// If we have an old broker with that ID (but a different host/port, since they didn't
+	// compare as equals above) then close and remove that broker before saving the new one.
 	for _, broker := range newBrokers {
 		if client.brokers[broker.ID()] != nil {
 			go client.brokers[broker.ID()].Close()
 		}
+		broker.AsyncConnect()
 		client.brokers[broker.ID()] = broker
 	}
 
@@ -251,11 +246,8 @@ func (client *Client) update(data *MetadataResponse) ([]string, error) {
 		for _, partition := range topic.Partitions {
 			switch partition.Err {
 			case LEADER_NOT_AVAILABLE:
-				// in the LEADER_NOT_AVAILABLE case partition.Leader will be -1 because the
-				// partition is in the middle of leader election, so we fallthrough to save it
-				// anyways in order to avoid returning the stale leader (since -1 isn't a valid broker ID)
 				toRetry[topic.Name] = true
-				fallthrough
+				delete(client.leaders[topic.Name], partition.Id)
 			case NO_ERROR:
 				client.leaders[topic.Name][partition.Id] = partition.Leader
 			default: