
Merge branch 'master' into support-ssl

Conflicts:
	broker.go
	config.go
Andrea Lusuardi - uovobw committed 10 years ago
parent commit 1016308070
60 changed files with 2902 additions and 898 deletions
  1. .travis.yml (+9 -4)
  2. CHANGELOG.md (+106 -0)
  3. Makefile (+1 -1)
  4. Vagrantfile (+6 -1)
  5. async_producer.go (+272 -237)
  6. async_producer_test.go (+137 -18)
  7. broker.go (+10 -6)
  8. client.go (+241 -113)
  9. client_test.go (+285 -3)
  10. config.go (+6 -0)
  11. consumer.go (+234 -159)
  12. consumer_metadata_request.go (+5 -0)
  13. consumer_metadata_request_test.go (+2 -2)
  14. consumer_metadata_response.go (+33 -8)
  15. consumer_metadata_response_test.go (+8 -0)
  16. consumer_test.go (+204 -27)
  17. encoder_decoder.go (+4 -4)
  18. fetch_request.go (+53 -0)
  19. fetch_request_test.go (+3 -3)
  20. functional_client_test.go (+90 -0)
  21. functional_consumer_test.go (+61 -0)
  22. functional_producer_test.go (+203 -0)
  23. functional_test.go (+81 -192)
  24. message.go (+1 -4)
  25. metadata_request.go (+20 -0)
  26. metadata_request_test.go (+3 -3)
  27. mockbroker_test.go (+1 -0)
  28. mocks/consumer.go (+51 -3)
  29. mocks/consumer_test.go (+58 -1)
  30. offset_commit_request.go (+115 -14)
  31. offset_commit_request_test.go (+64 -8)
  32. offset_fetch_request.go (+32 -1)
  33. offset_fetch_request_test.go (+3 -3)
  34. offset_request.go (+48 -1)
  35. offset_request_test.go (+2 -2)
  36. prep_encoder.go (+4 -5)
  37. produce_request.go (+50 -0)
  38. produce_request_test.go (+3 -3)
  39. real_decoder.go (+6 -6)
  40. real_encoder.go (+4 -4)
  41. request.go (+75 -4)
  42. request_test.go (+22 -15)
  43. sarama.go (+4 -3)
  44. snappy.go (+12 -8)
  45. snappy_test.go (+2 -4)
  46. tools/README.md (+4 -1)
  47. tools/kafka-console-consumer/.gitignore (+2 -0)
  48. tools/kafka-console-consumer/README.md (+29 -0)
  49. tools/kafka-console-consumer/kafka-console-consumer.go (+145 -0)
  50. tools/kafka-console-partitionconsumer/README.md (+4 -1)
  51. tools/kafka-console-producer/README.md (+1 -1)
  52. utils.go (+4 -2)
  53. vagrant/boot_cluster.sh (+12 -3)
  54. vagrant/create_topics.sh (+4 -7)
  55. vagrant/install_cluster.sh (+15 -3)
  56. vagrant/run_toxiproxy.sh (+22 -0)
  57. vagrant/server.properties (+8 -6)
  58. vagrant/setup_services.sh (+10 -2)
  59. vagrant/toxiproxy.conf (+6 -0)
  60. vagrant/zookeeper.conf (+2 -2)

+ 9 - 4
.travis.yml

@@ -1,14 +1,15 @@
 language: go
 go:
-- 1.3
-- 1.4
-- tip
+- 1.3.3
+- 1.4.2
 
 env:
   global:
-  - KAFKA_PEERS=localhost:6667,localhost:6668,localhost:6669,localhost:6670,localhost:6671
+  - KAFKA_PEERS=localhost:9091,localhost:9092,localhost:9093,localhost:9094,localhost:9095
+  - TOXIPROXY_ADDR=http://localhost:8474
   - KAFKA_INSTALL_ROOT=/home/travis/kafka
   - KAFKA_HOSTNAME=localhost
+  - DEBUG=true
   matrix:
   - KAFKA_VERSION=0.8.1.1
   - KAFKA_VERSION=0.8.2.1
@@ -30,7 +31,11 @@ script:
 - make fmt
 
 matrix:
+  include:
+    - go: tip
+      env: KAFKA_VERSION=0.8.2.1
   allow_failures:
     - go: tip
+  fast_finish: true
 
 sudo: false

+ 106 - 0
CHANGELOG.md

@@ -1,5 +1,111 @@
 # Changelog
 
+#### Version 1.4.3 (2015-07-21)
+
+Bug Fixes:
+ - Don't include the partitioner in the producer's "fetch partitions"
+   circuit-breaker ([#466](https://github.com/Shopify/sarama/pull/466)).
+ - Don't retry messages until the broker is closed when abandoning a broker in
+   the producer ([#468](https://github.com/Shopify/sarama/pull/468)).
+ - Update the import path for snappy-go, it has moved again and the API has
+   changed slightly ([#486](https://github.com/Shopify/sarama/pull/486)).
+
+#### Version 1.4.2 (2015-05-27)
+
+Bug Fixes:
+ - Update the import path for snappy-go, it has moved from google code to github
+   ([#456](https://github.com/Shopify/sarama/pull/456)).
+
+#### Version 1.4.1 (2015-05-25)
+
+Improvements:
+ - Optimizations when decoding snappy messages, thanks to John Potocny
+   ([#446](https://github.com/Shopify/sarama/pull/446)).
+
+Bug Fixes:
+ - Fix hypothetical race conditions on producer shutdown
+   ([#450](https://github.com/Shopify/sarama/pull/450),
+   [#451](https://github.com/Shopify/sarama/pull/451)).
+
+#### Version 1.4.0 (2015-05-01)
+
+New Features:
+ - The consumer now implements `Topics()` and `Partitions()` methods to enable
+   users to dynamically choose what topics/partitions to consume without
+   instantiating a full client
+   ([#431](https://github.com/Shopify/sarama/pull/431)).
+ - The partition-consumer now exposes the high water mark offset value returned
+   by the broker via the `HighWaterMarkOffset()` method ([#339](https://github.com/Shopify/sarama/pull/339)).
+ - Added a `kafka-console-consumer` tool capable of handling multiple
+   partitions, and deprecated the now-obsolete `kafka-console-partitionConsumer`
+   ([#439](https://github.com/Shopify/sarama/pull/439),
+   [#442](https://github.com/Shopify/sarama/pull/442)).
+
+Improvements:
+ - The producer's logging during retry scenarios is more consistent, more
+   useful, and slightly less verbose
+   ([#429](https://github.com/Shopify/sarama/pull/429)).
+ - The client now shuffles its initial list of seed brokers in order to prevent
+   thundering herd on the first broker in the list
+   ([#441](https://github.com/Shopify/sarama/pull/441)).
+
+Bug Fixes:
+ - The producer now correctly manages its state if retries occur when it is
+   shutting down, fixing several instances of confusing behaviour and at least
+   one potential deadlock ([#419](https://github.com/Shopify/sarama/pull/419)).
+ - The consumer now handles messages for different partitions asynchronously,
+   making it much more resilient to specific user code ordering
+   ([#325](https://github.com/Shopify/sarama/pull/325)).
+
+#### Version 1.3.0 (2015-04-16)
+
+New Features:
+ - The client now tracks consumer group coordinators using
+   ConsumerMetadataRequests similar to how it tracks partition leadership using
+   regular MetadataRequests ([#411](https://github.com/Shopify/sarama/pull/411)).
+   This adds two methods to the client API:
+   - `Coordinator(consumerGroup string) (*Broker, error)`
+   - `RefreshCoordinator(consumerGroup string) error`
+
+Improvements:
+ - ConsumerMetadataResponses now automatically create a Broker object out of the
+   ID/address/port combination for the Coordinator; accessing the fields
+   individually has been deprecated
+   ([#413](https://github.com/Shopify/sarama/pull/413)).
+ - Much improved handling of `OffsetOutOfRange` errors in the consumer.
+   Consumers will fail to start if the provided offset is out of range
+   ([#418](https://github.com/Shopify/sarama/pull/418))
+   and they will automatically shut down if the offset falls out of range
+   ([#424](https://github.com/Shopify/sarama/pull/424)).
+ - Small performance improvement in encoding and decoding protocol messages
+   ([#427](https://github.com/Shopify/sarama/pull/427)).
+
+Bug Fixes:
+ - Fix a rare race condition in the client's background metadata refresher if
+   it happens to be activated while the client is being closed
+   ([#422](https://github.com/Shopify/sarama/pull/422)).
+
+#### Version 1.2.0 (2015-04-07)
+
+Improvements:
+ - The producer's behaviour when `Flush.Frequency` is set is now more intuitive
+   ([#389](https://github.com/Shopify/sarama/pull/389)).
+ - The producer is now somewhat more memory-efficient during and after retrying
+   messages due to an improved queue implementation
+   ([#396](https://github.com/Shopify/sarama/pull/396)).
+ - The consumer produces much more useful logging output when leadership
+   changes ([#385](https://github.com/Shopify/sarama/pull/385)).
+ - The client's `GetOffset` method will now automatically refresh metadata and
+   retry once in the event of stale information or similar
+   ([#394](https://github.com/Shopify/sarama/pull/394)).
+ - Broker connections now have support for using TCP keepalives
+   ([#407](https://github.com/Shopify/sarama/issues/407)).
+
+Bug Fixes:
+ - The OffsetCommitRequest message now correctly implements all three possible
+   API versions ([#390](https://github.com/Shopify/sarama/pull/390),
+   [#400](https://github.com/Shopify/sarama/pull/400)).
+
 #### Version 1.1.0 (2015-03-20)
 
 Improvements:

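The 1.3.0 changelog entry above adds `Coordinator(consumerGroup string) (*Broker, error)` and `RefreshCoordinator(consumerGroup string) error` to the client API. Below is a minimal, hedged sketch of how a caller might use them; the seed broker address and group name are hypothetical and error handling is abbreviated.

```go
package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	// Hypothetical seed broker and consumer group, for illustration only.
	client, err := sarama.NewClient([]string{"localhost:9092"}, sarama.NewConfig())
	if err != nil {
		log.Fatal(err)
	}
	defer func() { _ = client.Close() }()

	// Coordinator serves from the local cache when possible; the first call
	// for a group triggers a lookup (Kafka 0.8.2+ only, per the changelog).
	broker, err := client.Coordinator("my-group")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("coordinator for my-group: broker #%d at %s", broker.ID(), broker.Addr())

	// RefreshCoordinator forces the cached value to be re-fetched.
	if err := client.RefreshCoordinator("my-group"); err != nil {
		log.Println("coordinator refresh failed:", err)
	}
}
```

Per the changelog, this mirrors how the client already tracks partition leadership with regular MetadataRequests, just using ConsumerMetadataRequests for group coordinators.
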
+ 1 - 1
Makefile

@@ -21,4 +21,4 @@ install_go_vet:
 	go get golang.org/x/tools/cmd/vet
 
 get:
-	go get
+	go get -t

+ 6 - 1
Vagrantfile

@@ -4,6 +4,8 @@
 # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
 VAGRANTFILE_API_VERSION = "2"
 
+MEMORY = 3072
+
 Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
   config.vm.box = "hashicorp/precise64"
 
@@ -12,6 +14,9 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
   config.vm.network "private_network", ip: "192.168.100.67"
 
   config.vm.provider "vmware_fusion" do |v|
-    v.vmx["memsize"] = "3072"
+    v.vmx["memsize"] = MEMORY.to_s
+  end
+  config.vm.provider "virtualbox" do |v|
+    v.memory = MEMORY
   end
 end

+ 272 - 237
async_producer.go

@@ -6,6 +6,7 @@ import (
 	"time"
 
 	"github.com/eapache/go-resiliency/breaker"
+	"github.com/eapache/queue"
 )
 
 func forceFlushThreshold() int {
@@ -35,7 +36,7 @@ type AsyncProducer interface {
 	// Input is the input channel for the user to write messages to that they wish to send.
 	Input() chan<- *ProducerMessage
 
-	// Successes is the success output channel back to the user when AckSuccesses is confured.
+	// Successes is the success output channel back to the user when AckSuccesses is enabled.
 	// If Return.Successes is true, you MUST read from this channel or the Producer will deadlock.
 	// It is suggested that you send and read messages together in a single select statement.
 	Successes() <-chan *ProducerMessage
@@ -53,6 +54,7 @@ type asyncProducer struct {
 
 	errors                    chan *ProducerError
 	input, successes, retries chan *ProducerMessage
+	inFlight                  sync.WaitGroup
 
 	brokers    map[*Broker]chan *ProducerMessage
 	brokerRefs map[chan *ProducerMessage]int
@@ -94,7 +96,7 @@ func NewAsyncProducerFromClient(client Client) (AsyncProducer, error) {
 	}
 
 	// launch our singleton dispatchers
-	go withRecover(p.topicDispatcher)
+	go withRecover(p.dispatcher)
 	go withRecover(p.retryHandler)
 
 	return p, nil
@@ -104,8 +106,6 @@ type flagSet int8
 
 const (
 	chaser   flagSet = 1 << iota // message is last in a group that failed
-	ref                          // add a reference to a singleton channel
-	unref                        // remove a reference from a singleton channel
 	shutdown                     // start the shutdown process
 )
 
@@ -136,6 +136,11 @@ func (m *ProducerMessage) byteSize() int {
 	return size
 }
 
+func (m *ProducerMessage) clear() {
+	m.flags = 0
+	m.retries = 0
+}
+
 // ProducerError is the type of error generated when the producer fails to deliver a message.
 // It contains the original ProducerMessage as well as the actual error value.
 type ProducerError struct {
@@ -192,22 +197,14 @@ func (p *asyncProducer) Close() error {
 }
 
 func (p *asyncProducer) AsyncClose() {
-	go withRecover(func() {
-		p.input <- &ProducerMessage{flags: shutdown}
-	})
+	go withRecover(p.shutdown)
 }
 
-///////////////////////////////////////////
-// In normal processing, a message flows through the following functions from top to bottom,
-// starting at topicDispatcher (which reads from Producer.input) and ending in flusher
-// (which sends the message to the broker). In cases where a message must be retried, it goes
-// through retryHandler before being returned to the top of the flow.
-///////////////////////////////////////////
-
 // singleton
 // dispatches messages by topic
-func (p *asyncProducer) topicDispatcher() {
+func (p *asyncProducer) dispatcher() {
 	handlers := make(map[string]chan *ProducerMessage)
+	shuttingDown := false
 
 	for msg := range p.input {
 		if msg == nil {
@@ -216,8 +213,22 @@ func (p *asyncProducer) topicDispatcher() {
 		}
 
 		if msg.flags&shutdown != 0 {
-			Logger.Println("Producer shutting down.")
-			break
+			shuttingDown = true
+			p.inFlight.Done()
+			continue
+		} else if msg.retries == 0 {
+			if shuttingDown {
+				// we can't just call returnError here because that decrements the wait group,
+				// which hasn't been incremented yet for this message, and shouldn't be
+				pErr := &ProducerError{Msg: msg, Err: ErrShuttingDown}
+				if p.conf.Producer.Return.Errors {
+					p.errors <- pErr
+				} else {
+					Logger.Println(pErr)
+				}
+				continue
+			}
+			p.inFlight.Add(1)
 		}
 
 		if (p.conf.Producer.Compression == CompressionNone && msg.Value != nil && msg.Value.Length() > p.conf.Producer.MaxMessageBytes) ||
@@ -229,11 +240,8 @@ func (p *asyncProducer) topicDispatcher() {
 
 		handler := handlers[msg.Topic]
 		if handler == nil {
-			p.retries <- &ProducerMessage{flags: ref}
-			newHandler := make(chan *ProducerMessage, p.conf.ChannelBufferSize)
-			topic := msg.Topic // block local because go's closure semantics suck
-			go withRecover(func() { p.partitionDispatcher(topic, newHandler) })
-			handler = newHandler
+			handler = make(chan *ProducerMessage, p.conf.ChannelBufferSize)
+			p.newTopicProducer(msg.Topic, handler)
 			handlers[msg.Topic] = handler
 		}
 
@@ -243,155 +251,162 @@ func (p *asyncProducer) topicDispatcher() {
 	for _, handler := range handlers {
 		close(handler)
 	}
-
-	p.retries <- &ProducerMessage{flags: shutdown}
-
-	for msg := range p.input {
-		p.returnError(msg, ErrShuttingDown)
-	}
-
-	if p.ownClient {
-		err := p.client.Close()
-		if err != nil {
-			Logger.Println("producer/shutdown failed to close the embedded client:", err)
-		}
-	}
-	close(p.errors)
-	close(p.successes)
 }
 
 // one per topic
 // partitions messages, then dispatches them by partition
-func (p *asyncProducer) partitionDispatcher(topic string, input chan *ProducerMessage) {
-	handlers := make(map[int32]chan *ProducerMessage)
-	partitioner := p.conf.Producer.Partitioner(topic)
-	breaker := breaker.New(3, 1, 10*time.Second)
+type topicProducer struct {
+	parent *asyncProducer
+	topic  string
+	input  <-chan *ProducerMessage
+
+	breaker     *breaker.Breaker
+	handlers    map[int32]chan *ProducerMessage
+	partitioner Partitioner
+}
+
+func (p *asyncProducer) newTopicProducer(topic string, input <-chan *ProducerMessage) *topicProducer {
+	tp := &topicProducer{
+		parent:      p,
+		topic:       topic,
+		input:       input,
+		breaker:     breaker.New(3, 1, 10*time.Second),
+		handlers:    make(map[int32]chan *ProducerMessage),
+		partitioner: p.conf.Producer.Partitioner(topic),
+	}
+	go withRecover(tp.dispatch)
+	return tp
+}
 
-	for msg := range input {
+func (tp *topicProducer) dispatch() {
+	for msg := range tp.input {
 		if msg.retries == 0 {
-			err := breaker.Run(func() error {
-				return p.assignPartition(partitioner, msg)
-			})
-			if err != nil {
-				p.returnError(msg, err)
+			if err := tp.partitionMessage(msg); err != nil {
+				tp.parent.returnError(msg, err)
 				continue
 			}
 		}
 
-		handler := handlers[msg.Partition]
+		handler := tp.handlers[msg.Partition]
 		if handler == nil {
-			p.retries <- &ProducerMessage{flags: ref}
-			newHandler := make(chan *ProducerMessage, p.conf.ChannelBufferSize)
-			topic := msg.Topic         // block local because go's closure semantics suck
-			partition := msg.Partition // block local because go's closure semantics suck
-			go withRecover(func() { p.leaderDispatcher(topic, partition, newHandler) })
-			handler = newHandler
-			handlers[msg.Partition] = handler
+			handler = make(chan *ProducerMessage, tp.parent.conf.ChannelBufferSize)
+			tp.parent.newPartitionProducer(msg.Topic, msg.Partition, handler)
+			tp.handlers[msg.Partition] = handler
 		}
 
 		handler <- msg
 	}
 
-	for _, handler := range handlers {
+	for _, handler := range tp.handlers {
 		close(handler)
 	}
-	p.retries <- &ProducerMessage{flags: unref}
 }
 
-// one per partition per topic
-// dispatches messages to the appropriate broker
-// also responsible for maintaining message order during retries
-func (p *asyncProducer) leaderDispatcher(topic string, partition int32, input chan *ProducerMessage) {
-	var leader *Broker
-	var output chan *ProducerMessage
+func (tp *topicProducer) partitionMessage(msg *ProducerMessage) error {
+	var partitions []int32
 
-	breaker := breaker.New(3, 1, 10*time.Second)
-	doUpdate := func() (err error) {
-		if err = p.client.RefreshMetadata(topic); err != nil {
-			return err
+	err := tp.breaker.Run(func() (err error) {
+		if tp.partitioner.RequiresConsistency() {
+			partitions, err = tp.parent.client.Partitions(msg.Topic)
+		} else {
+			partitions, err = tp.parent.client.WritablePartitions(msg.Topic)
 		}
+		return
+	})
 
-		if leader, err = p.client.Leader(topic, partition); err != nil {
-			return err
-		}
+	if err != nil {
+		return err
+	}
 
-		output = p.getBrokerProducer(leader)
-		return nil
+	numPartitions := int32(len(partitions))
+
+	if numPartitions == 0 {
+		return ErrLeaderNotAvailable
 	}
 
-	// try to prefetch the leader; if this doesn't work, we'll do a proper breaker-protected refresh-and-fetch
-	// on the first message
-	leader, _ = p.client.Leader(topic, partition)
-	if leader != nil {
-		output = p.getBrokerProducer(leader)
+	choice, err := tp.partitioner.Partition(msg, numPartitions)
+
+	if err != nil {
+		return err
+	} else if choice < 0 || choice >= numPartitions {
+		return ErrInvalidPartition
 	}
 
+	msg.Partition = partitions[choice]
+
+	return nil
+}
+
+// one per partition per topic
+// dispatches messages to the appropriate broker
+// also responsible for maintaining message order during retries
+type partitionProducer struct {
+	parent    *asyncProducer
+	topic     string
+	partition int32
+	input     <-chan *ProducerMessage
+
+	leader  *Broker
+	breaker *breaker.Breaker
+	output  chan *ProducerMessage
+
 	// highWatermark tracks the "current" retry level, which is the only one where we actually let messages through,
 	// all other messages get buffered in retryState[msg.retries].buf to preserve ordering
 	// retryState[msg.retries].expectChaser simply tracks whether we've seen a chaser message for a given level (and
 	// therefore whether our buffer is complete and safe to flush)
-	highWatermark := 0
-	retryState := make([]struct {
-		buf          []*ProducerMessage
-		expectChaser bool
-	}, p.conf.Producer.Retry.Max+1)
-
-	for msg := range input {
-		if msg.retries > highWatermark {
-			// new, higher, retry level; send off a chaser so that we know when everything "in between" has made it
-			// back to us and we can safely flush the backlog (otherwise we risk re-ordering messages)
-			highWatermark = msg.retries
-			Logger.Printf("producer/leader state change to [retrying-%d] on %s/%d\n", highWatermark, topic, partition)
-			retryState[msg.retries].expectChaser = true
-			output <- &ProducerMessage{Topic: topic, Partition: partition, flags: chaser, retries: msg.retries - 1}
-			Logger.Printf("producer/leader abandoning broker %d on %s/%d\n", leader.ID(), topic, partition)
-			p.unrefBrokerProducer(leader, output)
-			output = nil
-			time.Sleep(p.conf.Producer.Retry.Backoff)
-		} else if highWatermark > 0 {
+	highWatermark int
+	retryState    []partitionRetryState
+}
+
+type partitionRetryState struct {
+	buf          []*ProducerMessage
+	expectChaser bool
+}
+
+func (p *asyncProducer) newPartitionProducer(topic string, partition int32, input <-chan *ProducerMessage) *partitionProducer {
+	pp := &partitionProducer{
+		parent:    p,
+		topic:     topic,
+		partition: partition,
+		input:     input,
+
+		breaker:    breaker.New(3, 1, 10*time.Second),
+		retryState: make([]partitionRetryState, p.conf.Producer.Retry.Max+1),
+	}
+	go withRecover(pp.dispatch)
+	return pp
+}
+
+func (pp *partitionProducer) dispatch() {
+	// try to prefetch the leader; if this doesn't work, we'll do a proper call to `updateLeader`
+	// on the first message
+	pp.leader, _ = pp.parent.client.Leader(pp.topic, pp.partition)
+	if pp.leader != nil {
+		pp.output = pp.parent.getBrokerProducer(pp.leader)
+	}
+
+	for msg := range pp.input {
+		if msg.retries > pp.highWatermark {
+			// a new, higher, retry level; handle it and then back off
+			pp.newHighWatermark(msg.retries)
+			time.Sleep(pp.parent.conf.Producer.Retry.Backoff)
+		} else if pp.highWatermark > 0 {
 			// we are retrying something (else highWatermark would be 0) but this message is not a *new* retry level
-			if msg.retries < highWatermark {
+			if msg.retries < pp.highWatermark {
 				// in fact this message is not even the current retry level, so buffer it for now (unless it's a just a chaser)
 				if msg.flags&chaser == chaser {
-					retryState[msg.retries].expectChaser = false
+					pp.retryState[msg.retries].expectChaser = false
+					pp.parent.inFlight.Done() // this chaser is now handled and will be garbage collected
 				} else {
-					retryState[msg.retries].buf = append(retryState[msg.retries].buf, msg)
+					pp.retryState[msg.retries].buf = append(pp.retryState[msg.retries].buf, msg)
 				}
 				continue
 			} else if msg.flags&chaser == chaser {
 				// this message is of the current retry level (msg.retries == highWatermark) and the chaser flag is set,
 				// meaning this retry level is done and we can go down (at least) one level and flush that
-				retryState[highWatermark].expectChaser = false
-				Logger.Printf("producer/leader state change to [normal-%d] on %s/%d\n", highWatermark, topic, partition)
-				for {
-					highWatermark--
-					Logger.Printf("producer/leader state change to [flushing-%d] on %s/%d\n", highWatermark, topic, partition)
-
-					if output == nil {
-						if err := breaker.Run(doUpdate); err != nil {
-							p.returnErrors(retryState[highWatermark].buf, err)
-							goto flushDone
-						}
-						Logger.Printf("producer/leader selected broker %d on %s/%d\n", leader.ID(), topic, partition)
-					}
-
-					for _, msg := range retryState[highWatermark].buf {
-						output <- msg
-					}
-
-				flushDone:
-					retryState[highWatermark].buf = nil
-					if retryState[highWatermark].expectChaser {
-						Logger.Printf("producer/leader state change to [retrying-%d] on %s/%d\n", highWatermark, topic, partition)
-						break
-					} else {
-						Logger.Printf("producer/leader state change to [normal-%d] on %s/%d\n", highWatermark, topic, partition)
-						if highWatermark == 0 {
-							break
-						}
-					}
-
-				}
+				pp.retryState[pp.highWatermark].expectChaser = false
+				pp.flushRetryBuffers()
+				pp.parent.inFlight.Done() // this chaser is now handled and will be garbage collected
 				continue
 			}
 		}
@@ -399,46 +414,101 @@ func (p *asyncProducer) leaderDispatcher(topic string, partition int32, input ch
 		// if we made it this far then the current msg contains real data, and can be sent to the next goroutine
 		// without breaking any of our ordering guarantees
 
-		if output == nil {
-			if err := breaker.Run(doUpdate); err != nil {
-				p.returnError(msg, err)
-				time.Sleep(p.conf.Producer.Retry.Backoff)
+		if pp.output == nil {
+			if err := pp.updateLeader(); err != nil {
+				pp.parent.returnError(msg, err)
+				time.Sleep(pp.parent.conf.Producer.Retry.Backoff)
 				continue
 			}
-			Logger.Printf("producer/leader selected broker %d on %s/%d\n", leader.ID(), topic, partition)
+			Logger.Printf("producer/leader/%s/%d selected broker %d\n", pp.topic, pp.partition, pp.leader.ID())
 		}
 
-		output <- msg
+		pp.output <- msg
+	}
+
+	if pp.output != nil {
+		pp.parent.unrefBrokerProducer(pp.leader, pp.output)
 	}
+}
+
+func (pp *partitionProducer) newHighWatermark(hwm int) {
+	Logger.Printf("producer/leader/%s/%d state change to [retrying-%d]\n", pp.topic, pp.partition, hwm)
+	pp.highWatermark = hwm
+
+	// send off a chaser so that we know when everything "in between" has made it
+	// back to us and we can safely flush the backlog (otherwise we risk re-ordering messages)
+	pp.retryState[pp.highWatermark].expectChaser = true
+	pp.parent.inFlight.Add(1) // we're generating a chaser message; track it so we don't shut down while it's still inflight
+	pp.output <- &ProducerMessage{Topic: pp.topic, Partition: pp.partition, flags: chaser, retries: pp.highWatermark - 1}
+
+	// a new HWM means that our current broker selection is out of date
+	Logger.Printf("producer/leader/%s/%d abandoning broker %d\n", pp.topic, pp.partition, pp.leader.ID())
+	pp.parent.unrefBrokerProducer(pp.leader, pp.output)
+	pp.output = nil
+}
+
+func (pp *partitionProducer) flushRetryBuffers() {
+	Logger.Printf("producer/leader/%s/%d state change to [flushing-%d]\n", pp.topic, pp.partition, pp.highWatermark)
+	for {
+		pp.highWatermark--
 
-	if output != nil {
-		p.unrefBrokerProducer(leader, output)
+		if pp.output == nil {
+			if err := pp.updateLeader(); err != nil {
+				pp.parent.returnErrors(pp.retryState[pp.highWatermark].buf, err)
+				goto flushDone
+			}
+			Logger.Printf("producer/leader/%s/%d selected broker %d\n", pp.topic, pp.partition, pp.leader.ID())
+		}
+
+		for _, msg := range pp.retryState[pp.highWatermark].buf {
+			pp.output <- msg
+		}
+
+	flushDone:
+		pp.retryState[pp.highWatermark].buf = nil
+		if pp.retryState[pp.highWatermark].expectChaser {
+			Logger.Printf("producer/leader/%s/%d state change to [retrying-%d]\n", pp.topic, pp.partition, pp.highWatermark)
+			break
+		} else if pp.highWatermark == 0 {
+			Logger.Printf("producer/leader/%s/%d state change to [normal]\n", pp.topic, pp.partition)
+			break
+		}
 	}
-	p.retries <- &ProducerMessage{flags: unref}
+}
+
+func (pp *partitionProducer) updateLeader() error {
+	return pp.breaker.Run(func() (err error) {
+		if err = pp.parent.client.RefreshMetadata(pp.topic); err != nil {
+			return err
+		}
+
+		if pp.leader, err = pp.parent.client.Leader(pp.topic, pp.partition); err != nil {
+			return err
+		}
+
+		pp.output = pp.parent.getBrokerProducer(pp.leader)
+		return nil
+	})
 }
 
 // one per broker
 // groups messages together into appropriately-sized batches for sending to the broker
 // based on https://godoc.org/github.com/eapache/channels#BatchingChannel
-func (p *asyncProducer) messageAggregator(broker *Broker, input chan *ProducerMessage) {
-	var ticker *time.Ticker
-	var timer <-chan time.Time
-	if p.conf.Producer.Flush.Frequency > 0 {
-		ticker = time.NewTicker(p.conf.Producer.Flush.Frequency)
-		timer = ticker.C
-	}
-
-	var buffer []*ProducerMessage
-	var doFlush chan []*ProducerMessage
-	var bytesAccumulated int
-	var defaultFlush bool
+func (p *asyncProducer) messageAggregator(broker *Broker, input <-chan *ProducerMessage) {
+	var (
+		timer            <-chan time.Time
+		buffer           []*ProducerMessage
+		flushTriggered   chan []*ProducerMessage
+		bytesAccumulated int
+		defaultFlush     bool
+	)
 
 	if p.conf.Producer.Flush.Frequency == 0 && p.conf.Producer.Flush.Bytes == 0 && p.conf.Producer.Flush.Messages == 0 {
 		defaultFlush = true
 	}
 
-	flusher := make(chan []*ProducerMessage)
-	go withRecover(func() { p.flusher(broker, flusher) })
+	output := make(chan []*ProducerMessage)
+	go withRecover(func() { p.flusher(broker, output) })
 
 	for {
 		select {
@@ -450,10 +520,11 @@ func (p *asyncProducer) messageAggregator(broker *Broker, input chan *ProducerMe
 			if (bytesAccumulated+msg.byteSize() >= forceFlushThreshold()) ||
 				(p.conf.Producer.Compression != CompressionNone && bytesAccumulated+msg.byteSize() >= p.conf.Producer.MaxMessageBytes) ||
 				(p.conf.Producer.Flush.MaxMessages > 0 && len(buffer) >= p.conf.Producer.Flush.MaxMessages) {
-				Logger.Println("producer/aggregator maximum request accumulated, forcing blocking flush")
-				flusher <- buffer
+				Logger.Printf("producer/aggregator/%d maximum request accumulated, forcing blocking flush\n", broker.ID())
+				output <- buffer
+				timer = nil
 				buffer = nil
-				doFlush = nil
+				flushTriggered = nil
 				bytesAccumulated = 0
 			}
 
@@ -464,30 +535,30 @@ func (p *asyncProducer) messageAggregator(broker *Broker, input chan *ProducerMe
 				msg.flags&chaser == chaser ||
 				(p.conf.Producer.Flush.Messages > 0 && len(buffer) >= p.conf.Producer.Flush.Messages) ||
 				(p.conf.Producer.Flush.Bytes > 0 && bytesAccumulated >= p.conf.Producer.Flush.Bytes) {
-				doFlush = flusher
+				flushTriggered = output
+			} else if p.conf.Producer.Flush.Frequency > 0 && timer == nil {
+				timer = time.After(p.conf.Producer.Flush.Frequency)
 			}
 		case <-timer:
-			doFlush = flusher
-		case doFlush <- buffer:
+			flushTriggered = output
+		case flushTriggered <- buffer:
+			timer = nil
 			buffer = nil
-			doFlush = nil
+			flushTriggered = nil
 			bytesAccumulated = 0
 		}
 	}
 
 shutdown:
-	if ticker != nil {
-		ticker.Stop()
-	}
 	if len(buffer) > 0 {
-		flusher <- buffer
+		output <- buffer
 	}
-	close(flusher)
+	close(output)
 }
 
 // one per broker
 // takes a batch at a time from the messageAggregator and sends to the broker
-func (p *asyncProducer) flusher(broker *Broker, input chan []*ProducerMessage) {
+func (p *asyncProducer) flusher(broker *Broker, input <-chan []*ProducerMessage) {
 	var closing error
 	currentRetries := make(map[string]map[int32]error)
 	Logger.Printf("producer/flusher/%d starting up\n", broker.ID())
@@ -538,17 +609,15 @@ func (p *asyncProducer) flusher(broker *Broker, input chan []*ProducerMessage) {
 		default:
 			Logger.Printf("producer/flusher/%d state change to [closing] because %s\n", broker.ID(), err)
 			p.abandonBrokerConnection(broker)
-			p.retryMessages(batch, err)
 			_ = broker.Close()
 			closing = err
+			p.retryMessages(batch, err)
 			continue
 		}
 
 		if response == nil {
 			// this only happens when RequiredAcks is NoResponse, so we have to assume success
-			if p.conf.Producer.Return.Successes {
-				p.returnSuccesses(batch)
-			}
+			p.returnSuccesses(batch)
 			continue
 		}
 
@@ -566,12 +635,10 @@ func (p *asyncProducer) flusher(broker *Broker, input chan []*ProducerMessage) {
 				switch block.Err {
 				case ErrNoError:
 					// All the messages for this topic-partition were delivered successfully!
-					if p.conf.Producer.Return.Successes {
-						for i := range msgs {
-							msgs[i].Offset = block.Offset + int64(i)
-						}
-						p.returnSuccesses(msgs)
+					for i := range msgs {
+						msgs[i].Offset = block.Offset + int64(i)
 					}
+					p.returnSuccesses(msgs)
 				case ErrUnknownTopicOrPartition, ErrNotLeaderForPartition, ErrLeaderNotAvailable,
 					ErrRequestTimedOut, ErrNotEnoughReplicas, ErrNotEnoughReplicasAfterAppend:
 					Logger.Printf("producer/flusher/%d state change to [retrying] on %s/%d because %v\n",
@@ -588,90 +655,55 @@ func (p *asyncProducer) flusher(broker *Broker, input chan []*ProducerMessage) {
 		}
 	}
 	Logger.Printf("producer/flusher/%d shut down\n", broker.ID())
-	p.retries <- &ProducerMessage{flags: unref}
 }
 
 // singleton
-// effectively a "bridge" between the flushers and the topicDispatcher in order to avoid deadlock
+// effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock
 // based on https://godoc.org/github.com/eapache/channels#InfiniteChannel
 func (p *asyncProducer) retryHandler() {
-	var buf []*ProducerMessage
 	var msg *ProducerMessage
-	refs := 0
-	shuttingDown := false
+	buf := queue.New()
 
 	for {
-		if len(buf) == 0 {
+		if buf.Length() == 0 {
 			msg = <-p.retries
 		} else {
 			select {
 			case msg = <-p.retries:
-			case p.input <- buf[0]:
-				buf = buf[1:]
+			case p.input <- buf.Peek().(*ProducerMessage):
+				buf.Remove()
 				continue
 			}
 		}
 
-		if msg.flags&ref != 0 {
-			refs++
-		} else if msg.flags&unref != 0 {
-			refs--
-			if refs == 0 && shuttingDown {
-				break
-			}
-		} else if msg.flags&shutdown != 0 {
-			shuttingDown = true
-			if refs == 0 {
-				break
-			}
-		} else {
-			buf = append(buf, msg)
+		if msg == nil {
+			return
 		}
-	}
 
-	close(p.retries)
-	for i := range buf {
-		p.input <- buf[i]
+		buf.Add(msg)
 	}
-	close(p.input)
 }
 
-///////////////////////////////////////////
-///////////////////////////////////////////
-
 // utility functions
 
-func (p *asyncProducer) assignPartition(partitioner Partitioner, msg *ProducerMessage) error {
-	var partitions []int32
-	var err error
-
-	if partitioner.RequiresConsistency() {
-		partitions, err = p.client.Partitions(msg.Topic)
-	} else {
-		partitions, err = p.client.WritablePartitions(msg.Topic)
-	}
-
-	if err != nil {
-		return err
-	}
-
-	numPartitions := int32(len(partitions))
+func (p *asyncProducer) shutdown() {
+	Logger.Println("Producer shutting down.")
+	p.inFlight.Add(1)
+	p.input <- &ProducerMessage{flags: shutdown}
 
-	if numPartitions == 0 {
-		return ErrLeaderNotAvailable
-	}
+	p.inFlight.Wait()
 
-	choice, err := partitioner.Partition(msg, numPartitions)
-
-	if err != nil {
-		return err
-	} else if choice < 0 || choice >= numPartitions {
-		return ErrInvalidPartition
+	if p.ownClient {
+		err := p.client.Close()
+		if err != nil {
+			Logger.Println("producer/shutdown failed to close the embedded client:", err)
+		}
 	}
 
-	msg.Partition = partitions[choice]
-
-	return nil
+	close(p.input)
+	close(p.retries)
+	close(p.errors)
+	close(p.successes)
 }
 
 func (p *asyncProducer) buildRequest(batch map[string]map[int32][]*ProducerMessage) *ProduceRequest {
@@ -737,14 +769,14 @@ func (p *asyncProducer) buildRequest(batch map[string]map[int32][]*ProducerMessa
 }
 
 func (p *asyncProducer) returnError(msg *ProducerMessage, err error) {
-	msg.flags = 0
-	msg.retries = 0
+	msg.clear()
 	pErr := &ProducerError{Msg: msg, Err: err}
 	if p.conf.Producer.Return.Errors {
 		p.errors <- pErr
 	} else {
 		Logger.Println(pErr)
 	}
+	p.inFlight.Done()
 }
 
 func (p *asyncProducer) returnErrors(batch []*ProducerMessage, err error) {
@@ -757,10 +789,14 @@ func (p *asyncProducer) returnErrors(batch []*ProducerMessage, err error) {
 
 func (p *asyncProducer) returnSuccesses(batch []*ProducerMessage) {
 	for _, msg := range batch {
-		if msg != nil {
-			msg.flags = 0
+		if msg == nil {
+			continue
+		}
+		if p.conf.Producer.Return.Successes {
+			msg.clear()
 			p.successes <- msg
 		}
+		p.inFlight.Done()
 	}
 }
 
@@ -785,7 +821,6 @@ func (p *asyncProducer) getBrokerProducer(broker *Broker) chan *ProducerMessage
 	bp := p.brokers[broker]
 
 	if bp == nil {
-		p.retries <- &ProducerMessage{flags: ref}
 		bp = make(chan *ProducerMessage)
 		p.brokers[broker] = bp
 		p.brokerRefs[bp] = 0

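The async_producer.go rewrite above replaces the old ref/unref flag messages with an `inFlight` WaitGroup and routes shutdown through `AsyncClose`/`shutdown`; the doc comment it touches also stresses that with `Return.Successes` enabled the caller must drain `Successes()` (and `Errors()`) or the producer deadlocks. A minimal usage sketch under those rules follows; the broker address, topic, and message count are assumptions, not taken from the commit.

```go
package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	config := sarama.NewConfig()
	config.Producer.Return.Successes = true // acks will arrive on Successes()

	// Broker address and topic are hypothetical; replace with real values.
	producer, err := sarama.NewAsyncProducer([]string{"localhost:9092"}, config)
	if err != nil {
		log.Fatal(err)
	}

	const total = 10
	input := producer.Input()
	for sent, acked := 0, 0; acked < total; {
		if sent == total {
			input = nil // nil channel disables the send case once everything is queued
		}
		select {
		case input <- &sarama.ProducerMessage{Topic: "my_topic", Value: sarama.StringEncoder("hello")}:
			sent++
		case msg := <-producer.Successes():
			log.Printf("delivered %s/%d at offset %d", msg.Topic, msg.Partition, msg.Offset)
			acked++
		case perr := <-producer.Errors():
			log.Println("delivery failed:", perr.Err)
			acked++
		}
	}

	// AsyncClose lets everything in flight drain (tracked by the new WaitGroup)
	// and then closes both output channels, so ranging over them terminates.
	producer.AsyncClose()
	for range producer.Successes() {
	}
	for perr := range producer.Errors() {
		log.Println("error during shutdown:", perr.Err)
	}
}
```

Sending and reading in one select, as the doc comment suggests, avoids the deadlock; nilling the input channel is just a convenient way to stop queueing new messages while continuing to drain acks.
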
+ 137 - 18
async_producer_test.go

@@ -1,11 +1,13 @@
 package sarama
 
 import (
+	"errors"
 	"log"
 	"os"
 	"os/signal"
 	"sync"
 	"testing"
+	"time"
 )
 
 const TestMessage = "ABC THE MESSAGE"
@@ -30,22 +32,48 @@ func closeProducer(t *testing.T, p AsyncProducer) {
 	wg.Wait()
 }
 
-func expectSuccesses(t *testing.T, p AsyncProducer, successes int) {
-	for i := 0; i < successes; i++ {
+func expectResults(t *testing.T, p AsyncProducer, successes, errors int) {
+	for successes > 0 || errors > 0 {
 		select {
 		case msg := <-p.Errors():
-			t.Error(msg.Err)
 			if msg.Msg.flags != 0 {
 				t.Error("Message had flags set")
 			}
+			errors--
+			if errors < 0 {
+				t.Error(msg.Err)
+			}
 		case msg := <-p.Successes():
 			if msg.flags != 0 {
 				t.Error("Message had flags set")
 			}
+			successes--
+			if successes < 0 {
+				t.Error("Too many successes")
+			}
 		}
 	}
 }
 
+type testPartitioner chan *int32
+
+func (p testPartitioner) Partition(msg *ProducerMessage, numPartitions int32) (int32, error) {
+	part := <-p
+	if part == nil {
+		return 0, errors.New("BOOM")
+	}
+
+	return *part, nil
+}
+
+func (p testPartitioner) RequiresConsistency() bool {
+	return true
+}
+
+func (p testPartitioner) feed(partition int32) {
+	p <- &partition
+}
+
 func TestAsyncProducer(t *testing.T) {
 	seedBroker := newMockBroker(t, 1)
 	leader := newMockBroker(t, 2)
@@ -119,7 +147,7 @@ func TestAsyncProducerMultipleFlushes(t *testing.T) {
 		for i := 0; i < 5; i++ {
 			producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
 		}
-		expectSuccesses(t, producer, 5)
+		expectResults(t, producer, 5, 0)
 	}
 
 	closeProducer(t, producer)
@@ -159,7 +187,7 @@ func TestAsyncProducerMultipleBrokers(t *testing.T) {
 	for i := 0; i < 10; i++ {
 		producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
 	}
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 
 	closeProducer(t, producer)
 	leader1.Close()
@@ -167,6 +195,48 @@ func TestAsyncProducerMultipleBrokers(t *testing.T) {
 	seedBroker.Close()
 }
 
+func TestAsyncProducerCustomPartitioner(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	leader := newMockBroker(t, 2)
+
+	metadataResponse := new(MetadataResponse)
+	metadataResponse.AddBroker(leader.Addr(), leader.BrokerID())
+	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
+	seedBroker.Returns(metadataResponse)
+
+	prodResponse := new(ProduceResponse)
+	prodResponse.AddTopicPartition("my_topic", 0, ErrNoError)
+	leader.Returns(prodResponse)
+
+	config := NewConfig()
+	config.Producer.Flush.Messages = 2
+	config.Producer.Return.Successes = true
+	config.Producer.Partitioner = func(topic string) Partitioner {
+		p := make(testPartitioner)
+		go func() {
+			p.feed(0)
+			p <- nil
+			p <- nil
+			p <- nil
+			p.feed(0)
+		}()
+		return p
+	}
+	producer, err := NewAsyncProducer([]string{seedBroker.Addr()}, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	for i := 0; i < 5; i++ {
+		producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
+	}
+	expectResults(t, producer, 2, 3)
+
+	closeProducer(t, producer)
+	leader.Close()
+	seedBroker.Close()
+}
+
 func TestAsyncProducerFailureRetry(t *testing.T) {
 	seedBroker := newMockBroker(t, 1)
 	leader1 := newMockBroker(t, 2)
@@ -202,14 +272,14 @@ func TestAsyncProducerFailureRetry(t *testing.T) {
 	prodSuccess := new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader2.Returns(prodSuccess)
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 	leader1.Close()
 
 	for i := 0; i < 10; i++ {
 		producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
 	}
 	leader2.Returns(prodSuccess)
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 
 	leader2.Close()
 	closeProducer(t, producer)
@@ -244,7 +314,7 @@ func TestAsyncProducerBrokerBounce(t *testing.T) {
 	prodSuccess := new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader.Returns(prodSuccess)
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 	seedBroker.Close()
 	leader.Close()
 
@@ -287,7 +357,7 @@ func TestAsyncProducerBrokerBounceWithStaleMetadata(t *testing.T) {
 	prodSuccess := new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader2.Returns(prodSuccess)
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 	seedBroker.Close()
 	leader2.Close()
 
@@ -335,13 +405,13 @@ func TestAsyncProducerMultipleRetries(t *testing.T) {
 	prodSuccess := new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader2.Returns(prodSuccess)
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 
 	for i := 0; i < 10; i++ {
 		producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
 	}
 	leader2.Returns(prodSuccess)
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 
 	seedBroker.Close()
 	leader1.Close()
@@ -399,7 +469,7 @@ func TestAsyncProducerOutOfRetries(t *testing.T) {
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader.Returns(prodSuccess)
 
-	expectSuccesses(t, producer, 10)
+	expectResults(t, producer, 10, 0)
 
 	leader.Close()
 	seedBroker.Close()
@@ -432,14 +502,14 @@ func TestAsyncProducerRetryWithReferenceOpen(t *testing.T) {
 	prodSuccess := new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader.Returns(prodSuccess)
-	expectSuccesses(t, producer, 1)
+	expectResults(t, producer, 1, 0)
 
 	// prime partition 1
 	producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
 	prodSuccess = new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 1, ErrNoError)
 	leader.Returns(prodSuccess)
-	expectSuccesses(t, producer, 1)
+	expectResults(t, producer, 1, 0)
 
 	// reboot the broker (the producer will get EOF on its existing connection)
 	leader.Close()
@@ -455,7 +525,7 @@ func TestAsyncProducerRetryWithReferenceOpen(t *testing.T) {
 	prodSuccess = new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader.Returns(prodSuccess)
-	expectSuccesses(t, producer, 1)
+	expectResults(t, producer, 1, 0)
 
 	// shutdown
 	closeProducer(t, producer)
@@ -492,7 +562,7 @@ func TestAsyncProducerFlusherRetryCondition(t *testing.T) {
 		prodSuccess := new(ProduceResponse)
 		prodSuccess.AddTopicPartition("my_topic", p, ErrNoError)
 		leader.Returns(prodSuccess)
-		expectSuccesses(t, producer, 5)
+		expectResults(t, producer, 5, 0)
 	}
 
 	// send more messages on partition 0
@@ -510,14 +580,14 @@ func TestAsyncProducerFlusherRetryCondition(t *testing.T) {
 	prodSuccess := new(ProduceResponse)
 	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
 	leader.Returns(prodSuccess)
-	expectSuccesses(t, producer, 5)
+	expectResults(t, producer, 5, 0)
 
 	// put five more through
 	for i := 0; i < 5; i++ {
 		producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage), Partition: 0}
 	}
 	leader.Returns(prodSuccess)
-	expectSuccesses(t, producer, 5)
+	expectResults(t, producer, 5, 0)
 
 	// shutdown
 	closeProducer(t, producer)
@@ -525,6 +595,55 @@ func TestAsyncProducerFlusherRetryCondition(t *testing.T) {
 	leader.Close()
 }
 
+func TestAsyncProducerRetryShutdown(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	leader := newMockBroker(t, 2)
+
+	metadataLeader := new(MetadataResponse)
+	metadataLeader.AddBroker(leader.Addr(), leader.BrokerID())
+	metadataLeader.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
+	seedBroker.Returns(metadataLeader)
+
+	config := NewConfig()
+	config.Producer.Flush.Messages = 10
+	config.Producer.Return.Successes = true
+	config.Producer.Retry.Backoff = 0
+	producer, err := NewAsyncProducer([]string{seedBroker.Addr()}, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	for i := 0; i < 10; i++ {
+		producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
+	}
+	producer.AsyncClose()
+	time.Sleep(5 * time.Millisecond) // let the shutdown goroutine kick in
+
+	producer.Input() <- &ProducerMessage{Topic: "FOO"}
+	if err := <-producer.Errors(); err.Err != ErrShuttingDown {
+		t.Error(err)
+	}
+
+	prodNotLeader := new(ProduceResponse)
+	prodNotLeader.AddTopicPartition("my_topic", 0, ErrNotLeaderForPartition)
+	leader.Returns(prodNotLeader)
+
+	seedBroker.Returns(metadataLeader)
+
+	prodSuccess := new(ProduceResponse)
+	prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
+	leader.Returns(prodSuccess)
+	expectResults(t, producer, 10, 0)
+
+	seedBroker.Close()
+	leader.Close()
+
+	// wait for the async-closed producer to shut down fully
+	for err := range producer.Errors() {
+		t.Error(err)
+	}
+}
+
 // This example shows how to use the producer while simultaneously
 // reading the Errors channel to know about any failures.
 func ExampleAsyncProducer_select() {

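`TestAsyncProducerCustomPartitioner` above plugs a `testPartitioner` into `config.Producer.Partitioner`, which is a `func(topic string) Partitioner`. The sketch below shows the same hook with a hypothetical key-hashing partitioner; the two interface methods (`Partition`, `RequiresConsistency`) are exactly those exercised by the test, everything else is an assumption.

```go
package main

import (
	"hash/fnv"

	"github.com/Shopify/sarama"
)

// keyModPartitioner is a hypothetical Partitioner: it hashes the encoded key
// and takes the result modulo the number of partitions.
type keyModPartitioner struct{}

func (keyModPartitioner) Partition(msg *sarama.ProducerMessage, numPartitions int32) (int32, error) {
	if msg.Key == nil {
		return 0, nil // keyless messages all land on partition 0 in this sketch
	}
	keyBytes, err := msg.Key.Encode()
	if err != nil {
		return -1, err
	}
	h := fnv.New32a()
	_, _ = h.Write(keyBytes)
	return int32(h.Sum32() % uint32(numPartitions)), nil
}

// Returning true makes the topic dispatcher use the full partition list rather
// than only the writable partitions, so a key always maps to the same partition.
func (keyModPartitioner) RequiresConsistency() bool { return true }

func buildConfig() *sarama.Config {
	config := sarama.NewConfig()
	config.Producer.Partitioner = func(topic string) sarama.Partitioner {
		return keyModPartitioner{}
	}
	return config
}

func main() {
	_ = buildConfig() // wire this into NewAsyncProducer as in the earlier sketch
}
```
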
+ 10 - 6
broker.go

@@ -89,7 +89,11 @@ func (b *Broker) Open(conf *Config) error {
 		b.done = make(chan bool)
 		b.responses = make(chan responsePromise, b.conf.Net.MaxOpenRequests-1)
 
-		Logger.Printf("Connected to broker %s\n", b.addr)
+		if b.id >= 0 {
+			Logger.Printf("Connected to broker at %s (registered as #%d)\n", b.addr, b.id)
+		} else {
+			Logger.Printf("Connected to broker at %s (unregistered)\n", b.addr)
+		}
 		go withRecover(b.responseReceiver)
 	})
 
@@ -234,7 +238,7 @@ func (b *Broker) FetchOffset(request *OffsetFetchRequest) (*OffsetFetchResponse,
 	return response, nil
 }
 
-func (b *Broker) send(req requestEncoder, promiseResponse bool) (*responsePromise, error) {
+func (b *Broker) send(rb requestBody, promiseResponse bool) (*responsePromise, error) {
 	b.lock.Lock()
 	defer b.lock.Unlock()
 
@@ -245,8 +249,8 @@ func (b *Broker) send(req requestEncoder, promiseResponse bool) (*responsePromis
 		return nil, ErrNotConnected
 	}
 
-	fullRequest := request{b.correlationID, b.conf.ClientID, req}
-	buf, err := encode(&fullRequest)
+	req := &request{correlationID: b.correlationID, clientID: b.conf.ClientID, body: rb}
+	buf, err := encode(req)
 	if err != nil {
 		return nil, err
 	}
@@ -266,13 +270,13 @@ func (b *Broker) send(req requestEncoder, promiseResponse bool) (*responsePromis
 		return nil, nil
 	}
 
-	promise := responsePromise{fullRequest.correlationID, make(chan []byte), make(chan error)}
+	promise := responsePromise{req.correlationID, make(chan []byte), make(chan error)}
 	b.responses <- promise
 
 	return &promise, nil
 }
 
-func (b *Broker) sendAndReceive(req requestEncoder, res decoder) error {
+func (b *Broker) sendAndReceive(req requestBody, res decoder) error {
 	promise, err := b.send(req, res != nil)
 
 	if err != nil {

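The broker.go changes rename `requestEncoder` to `requestBody` and have `send` wrap the body in a `request` carrying the correlation ID and client ID before handing back a `responsePromise`. None of that is visible to callers; a hedged sketch of driving a standalone `Broker` through that path is below, with a hypothetical address.

```go
package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	// Address is an assumption; any reachable Kafka broker would do.
	broker := sarama.NewBroker("localhost:9092")
	if err := broker.Open(sarama.NewConfig()); err != nil {
		log.Fatal(err)
	}
	defer func() { _ = broker.Close() }()

	// GetMetadata goes through sendAndReceive, which frames the request with
	// the configured ClientID and a correlation ID, then waits for the
	// response matched to that correlation ID.
	response, err := broker.GetMetadata(&sarama.MetadataRequest{Topics: []string{"my_topic"}})
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("received metadata for %d topic(s) from %s", len(response.Topics), broker.Addr())
}
```
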
+ 241 - 113
client.go

@@ -1,6 +1,7 @@
 package sarama
 
 import (
+	"math/rand"
 	"sort"
 	"sync"
 	"time"
@@ -42,6 +43,15 @@ type Client interface {
 	// offset, OffsetNewest for the offset of the message that will be produced next, or a time.
 	GetOffset(topic string, partitionID int32, time int64) (int64, error)
 
+	// Coordinator returns the coordinating broker for a consumer group. It will return a locally cached
+	// value if it's available. You can call RefreshCoordinator to update the cached value.
+	// This function only works on Kafka 0.8.2 and higher.
+	Coordinator(consumerGroup string) (*Broker, error)
+
+	// RefreshCoordinator retrieves the coordinator for a consumer group and stores it in local cache.
+	// This function only works on Kafka 0.8.2 and higher.
+	RefreshCoordinator(consumerGroup string) error
+
 	// Close shuts down all broker connections managed by this client. It is required to call this function before
 	// a client object passes out of scope, as it will otherwise leak memory. You must close any Producers or Consumers
 	// using a client before you close the client.
@@ -63,23 +73,24 @@ const (
 )
 
 type client struct {
-	conf   *Config
-	closer chan none
+	conf           *Config
+	closer, closed chan none // for shutting down background metadata updater
 
 	// the broker addresses given to us through the constructor are not guaranteed to be returned in
 	// the cluster metadata (I *think* it only returns brokers who are currently leading partitions?)
 	// so we store them separately
-	seedBrokerAddrs []string
-	seedBroker      *Broker
-	deadBrokerAddrs map[string]none
+	seedBrokers []*Broker
+	deadSeeds   []*Broker
 
-	brokers  map[int32]*Broker                       // maps broker ids to brokers
-	metadata map[string]map[int32]*PartitionMetadata // maps topics to partition ids to metadata
+	brokers      map[int32]*Broker                       // maps broker ids to brokers
+	metadata     map[string]map[int32]*PartitionMetadata // maps topics to partition ids to metadata
+	coordinators map[string]int32                        // Maps consumer group names to coordinating broker IDs
 
 	// If the number of partitions is large, we can get some churn calling cachedPartitions,
 	// so the result is cached.  It is important to update this value whenever metadata is changed
 	cachedPartitionsResults map[string][maxPartitionIndex][]int32
-	lock                    sync.RWMutex // protects access to the maps, only one since they're always written together
+
+	lock sync.RWMutex // protects access to the maps that hold cluster state.
 }
 
 // NewClient creates a new Client. It connects to one of the given broker addresses
@@ -103,14 +114,17 @@ func NewClient(addrs []string, conf *Config) (Client, error) {
 	client := &client{
 		conf:                    conf,
 		closer:                  make(chan none),
-		seedBrokerAddrs:         addrs,
-		seedBroker:              NewBroker(addrs[0]),
-		deadBrokerAddrs:         make(map[string]none),
+		closed:                  make(chan none),
 		brokers:                 make(map[int32]*Broker),
 		metadata:                make(map[string]map[int32]*PartitionMetadata),
 		cachedPartitionsResults: make(map[string][maxPartitionIndex][]int32),
+		coordinators:            make(map[string]int32),
+	}
+
+	random := rand.New(rand.NewSource(time.Now().UnixNano()))
+	for _, index := range random.Perm(len(addrs)) {
+		client.seedBrokers = append(client.seedBrokers, NewBroker(addrs[index]))
 	}
-	_ = client.seedBroker.Open(conf)
 
 	// do an initial fetch of all cluster metadata by specifing an empty list of topics
 	err := client.RefreshMetadata()
@@ -121,6 +135,7 @@ func NewClient(addrs []string, conf *Config) (Client, error) {
 		// indicates that maybe part of the cluster is down, but is not fatal to creating the client
 		Logger.Println(err)
 	default:
+		close(client.closed) // we haven't started the background updater yet, so we have to do this manually
 		_ = client.Close()
 		return nil, err
 	}
@@ -136,7 +151,6 @@ func (client *client) Config() *Config {
 }
 
 func (client *client) Close() error {
-	// Check to see whether the client is closed
 	if client.Closed() {
 		// Chances are this is being called from a defer() and the error will go unobserved
 		// so we go ahead and log the event in this case.
@@ -144,6 +158,10 @@ func (client *client) Close() error {
 		return ErrClosedClient
 	}
 
+	// shutdown and wait for the background thread before we take the lock, to avoid races
+	close(client.closer)
+	<-client.closed
+
 	client.lock.Lock()
 	defer client.lock.Unlock()
 	Logger.Println("Closing Client")
@@ -151,14 +169,13 @@ func (client *client) Close() error {
 	for _, broker := range client.brokers {
 		safeAsyncClose(broker)
 	}
-	client.brokers = nil
-	client.metadata = nil
 
-	if client.seedBroker != nil {
-		safeAsyncClose(client.seedBroker)
+	for _, broker := range client.seedBrokers {
+		safeAsyncClose(broker)
 	}
 
-	close(client.closer)
+	client.brokers = nil
+	client.metadata = nil
 
 	return nil
 }
@@ -168,7 +185,6 @@ func (client *client) Closed() bool {
 }
 
 func (client *client) Topics() ([]string, error) {
-	// Check to see whether the client is closed
 	if client.Closed() {
 		return nil, ErrClosedClient
 	}
@@ -185,7 +201,6 @@ func (client *client) Topics() ([]string, error) {
 }
 
 func (client *client) Partitions(topic string) ([]int32, error) {
-	// Check to see whether the client is closed
 	if client.Closed() {
 		return nil, ErrClosedClient
 	}
@@ -208,7 +223,6 @@ func (client *client) Partitions(topic string) ([]int32, error) {
 }
 
 func (client *client) WritablePartitions(topic string) ([]int32, error) {
-	// Check to see whether the client is closed
 	if client.Closed() {
 		return nil, ErrClosedClient
 	}
@@ -262,6 +276,10 @@ func (client *client) Replicas(topic string, partitionID int32) ([]int32, error)
 }
 
 func (client *client) Leader(topic string, partitionID int32) (*Broker, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
 	leader, err := client.cachedLeader(topic, partitionID)
 
 	if leader == nil {
@@ -293,54 +311,92 @@ func (client *client) RefreshMetadata(topics ...string) error {
 }
 
 func (client *client) GetOffset(topic string, partitionID int32, time int64) (int64, error) {
-	broker, err := client.Leader(topic, partitionID)
-	if err != nil {
-		return -1, err
+	if client.Closed() {
+		return -1, ErrClosedClient
 	}
 
-	request := &OffsetRequest{}
-	request.AddBlock(topic, partitionID, time, 1)
+	offset, err := client.getOffset(topic, partitionID, time)
 
-	response, err := broker.GetAvailableOffsets(request)
 	if err != nil {
-		return -1, err
+		if err := client.RefreshMetadata(topic); err != nil {
+			return -1, err
+		}
+		return client.getOffset(topic, partitionID, time)
 	}
 
-	block := response.GetBlock(topic, partitionID)
-	if block == nil {
-		return -1, ErrIncompleteResponse
+	return offset, err
+}
+
+func (client *client) Coordinator(consumerGroup string) (*Broker, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
 	}
-	if block.Err != ErrNoError {
-		return -1, block.Err
+
+	coordinator := client.cachedCoordinator(consumerGroup)
+
+	if coordinator == nil {
+		if err := client.RefreshCoordinator(consumerGroup); err != nil {
+			return nil, err
+		}
+		coordinator = client.cachedCoordinator(consumerGroup)
 	}
-	if len(block.Offsets) != 1 {
-		return -1, ErrOffsetOutOfRange
+
+	if coordinator == nil {
+		return nil, ErrConsumerCoordinatorNotAvailable
 	}
 
-	return block.Offsets[0], nil
+	_ = coordinator.Open(client.conf)
+	return coordinator, nil
 }
 
-// private broker management helpers
+func (client *client) RefreshCoordinator(consumerGroup string) error {
+	if client.Closed() {
+		return ErrClosedClient
+	}
+
+	response, err := client.getConsumerMetadata(consumerGroup, client.conf.Metadata.Retry.Max)
+	if err != nil {
+		return err
+	}
 
-func (client *client) disconnectBroker(broker *Broker) {
 	client.lock.Lock()
 	defer client.lock.Unlock()
+	client.registerBroker(response.Coordinator)
+	client.coordinators[consumerGroup] = response.Coordinator.ID()
+	return nil
+}
 
-	client.deadBrokerAddrs[broker.addr] = none{}
+// private broker management helpers
 
-	if broker == client.seedBroker {
-		client.seedBrokerAddrs = client.seedBrokerAddrs[1:]
-		if len(client.seedBrokerAddrs) > 0 {
-			client.seedBroker = NewBroker(client.seedBrokerAddrs[0])
-			_ = client.seedBroker.Open(client.conf)
-		} else {
-			client.seedBroker = nil
-		}
+// registerBroker makes sure a broker received by a Metadata or Coordinator request is registered
+// in the brokers map. It returns the broker that is registered, which may be the provided broker,
+// or a previously registered Broker instance. You must hold the write lock before calling this function.
+func (client *client) registerBroker(broker *Broker) {
+	if client.brokers[broker.ID()] == nil {
+		client.brokers[broker.ID()] = broker
+		Logger.Printf("client/brokers registered new broker #%d at %s", broker.ID(), broker.Addr())
+	} else if broker.Addr() != client.brokers[broker.ID()].Addr() {
+		safeAsyncClose(client.brokers[broker.ID()])
+		client.brokers[broker.ID()] = broker
+		Logger.Printf("client/brokers replaced registered broker #%d with %s", broker.ID(), broker.Addr())
+	}
+}
+
+// deregisterBroker removes a broker from the seedsBroker list, and if it's
+// not the seedbroker, removes it from brokers map completely.
+func (client *client) deregisterBroker(broker *Broker) {
+	client.lock.Lock()
+	defer client.lock.Unlock()
+
+	if len(client.seedBrokers) > 0 && broker == client.seedBrokers[0] {
+		client.deadSeeds = append(client.deadSeeds, broker)
+		client.seedBrokers = client.seedBrokers[1:]
 	} else {
 		// we do this so that our loop in `tryRefreshMetadata` doesn't go on forever,
 		// but we really shouldn't have to; once that loop is made better this case can be
-		// removed, and the function generally can be renamed from `disconnectBroker` to
+		// removed, and the function generally can be renamed from `deregisterBroker` to
 		// `nextSeedBroker` or something
+		Logger.Printf("client/brokers deregistered broker #%d at %s", broker.ID(), broker.Addr())
 		delete(client.brokers, broker.ID())
 	}
 }
@@ -349,29 +405,21 @@ func (client *client) resurrectDeadBrokers() {
 	client.lock.Lock()
 	defer client.lock.Unlock()
 
-	for _, addr := range client.seedBrokerAddrs {
-		client.deadBrokerAddrs[addr] = none{}
-	}
-
-	client.seedBrokerAddrs = []string{}
-	for addr := range client.deadBrokerAddrs {
-		client.seedBrokerAddrs = append(client.seedBrokerAddrs, addr)
-	}
-	client.deadBrokerAddrs = make(map[string]none)
-
-	client.seedBroker = NewBroker(client.seedBrokerAddrs[0])
-	_ = client.seedBroker.Open(client.conf)
+	Logger.Printf("client/brokers resurrecting %d dead seed brokers", len(client.deadSeeds))
+	client.seedBrokers = append(client.seedBrokers, client.deadSeeds...)
+	client.deadSeeds = nil
 }
 
 func (client *client) any() *Broker {
 	client.lock.RLock()
 	defer client.lock.RUnlock()
 
-	if client.seedBroker != nil {
-		_ = client.seedBroker.Open(client.conf)
-		return client.seedBroker
+	if len(client.seedBrokers) > 0 {
+		_ = client.seedBrokers[0].Open(client.conf)
+		return client.seedBrokers[0]
 	}
 
+	// not guaranteed to be random *or* deterministic
 	for _, broker := range client.brokers {
 		_ = broker.Open(client.conf)
 		return broker
@@ -459,14 +507,48 @@ func (client *client) cachedLeader(topic string, partitionID int32) (*Broker, er
 	return nil, ErrUnknownTopicOrPartition
 }
 
+func (client *client) getOffset(topic string, partitionID int32, time int64) (int64, error) {
+	broker, err := client.Leader(topic, partitionID)
+	if err != nil {
+		return -1, err
+	}
+
+	request := &OffsetRequest{}
+	request.AddBlock(topic, partitionID, time, 1)
+
+	response, err := broker.GetAvailableOffsets(request)
+	if err != nil {
+		_ = broker.Close()
+		return -1, err
+	}
+
+	block := response.GetBlock(topic, partitionID)
+	if block == nil {
+		_ = broker.Close()
+		return -1, ErrIncompleteResponse
+	}
+	if block.Err != ErrNoError {
+		return -1, block.Err
+	}
+	if len(block.Offsets) != 1 {
+		return -1, ErrOffsetOutOfRange
+	}
+
+	return block.Offsets[0], nil
+}
+
 // core metadata update logic
 
 func (client *client) backgroundMetadataUpdater() {
+	defer close(client.closed)
+
 	if client.conf.Metadata.RefreshFrequency == time.Duration(0) {
 		return
 	}
 
 	ticker := time.NewTicker(client.conf.Metadata.RefreshFrequency)
+	defer ticker.Stop()
+
 	for {
 		select {
 		case <-ticker.C:
@@ -474,64 +556,57 @@ func (client *client) backgroundMetadataUpdater() {
 				Logger.Println("Client background metadata update:", err)
 			}
 		case <-client.closer:
-			ticker.Stop()
 			return
 		}
 	}
 }
 
-func (client *client) tryRefreshMetadata(topics []string, retriesRemaining int) error {
+func (client *client) tryRefreshMetadata(topics []string, attemptsRemaining int) error {
+	retry := func(err error) error {
+		if attemptsRemaining > 0 {
+			Logger.Printf("client/metadata retrying after %dms... (%d attempts remaining)\n", client.conf.Metadata.Retry.Backoff/time.Millisecond, attemptsRemaining)
+			time.Sleep(client.conf.Metadata.Retry.Backoff)
+			return client.tryRefreshMetadata(topics, attemptsRemaining-1)
+		}
+		return err
+	}
+
 	for broker := client.any(); broker != nil; broker = client.any() {
 		if len(topics) > 0 {
-			Logger.Printf("Fetching metadata for %v from broker %s\n", topics, broker.addr)
+			Logger.Printf("client/metadata fetching metadata for %v from broker %s\n", topics, broker.addr)
 		} else {
-			Logger.Printf("Fetching metadata for all topics from broker %s\n", broker.addr)
+			Logger.Printf("client/metadata fetching metadata for all topics from broker %s\n", broker.addr)
 		}
 		response, err := broker.GetMetadata(&MetadataRequest{Topics: topics})
 
 		switch err.(type) {
 		case nil:
 			// valid response, use it
-			retry, err := client.updateMetadata(response)
-
-			if len(retry) > 0 {
-				if retriesRemaining <= 0 {
-					Logger.Println("Some partitions are leaderless, but we're out of retries")
-					return err
-				}
-				Logger.Printf("Some partitions are leaderless, waiting %dms for election... (%d retries remaining)\n",
-					client.conf.Metadata.Retry.Backoff/time.Millisecond, retriesRemaining)
-				time.Sleep(client.conf.Metadata.Retry.Backoff) // wait for leader election
-				return client.tryRefreshMetadata(retry, retriesRemaining-1)
+			if shouldRetry, err := client.updateMetadata(response); shouldRetry {
+				Logger.Println("client/metadata found some partitions to be leaderless")
+				return retry(err) // note: err can be nil
+			} else {
+				return err
 			}
 
-			return err
 		case PacketEncodingError:
 			// didn't even send, return the error
 			return err
 		default:
 			// some other error, remove that broker and try again
-			Logger.Println("Error from broker while fetching metadata:", err)
+			Logger.Println("client/metadata got error from broker while fetching metadata:", err)
 			_ = broker.Close()
-			client.disconnectBroker(broker)
+			client.deregisterBroker(broker)
 		}
 	}
 
-	Logger.Println("Out of available brokers.")
-
-	if retriesRemaining > 0 {
-		Logger.Printf("Resurrecting dead brokers after %dms... (%d retries remaining)\n",
-			client.conf.Metadata.Retry.Backoff/time.Millisecond, retriesRemaining)
-		time.Sleep(client.conf.Metadata.Retry.Backoff)
-		client.resurrectDeadBrokers()
-		return client.tryRefreshMetadata(topics, retriesRemaining-1)
-	}
-
-	return ErrOutOfBrokers
+	Logger.Println("client/metadata no available broker to send metadata request to")
+	client.resurrectDeadBrokers()
+	return retry(ErrOutOfBrokers)
 }
 
 // if no fatal error, returns a list of topics that need retrying due to ErrLeaderNotAvailable
-func (client *client) updateMetadata(data *MetadataResponse) ([]string, error) {
+func (client *client) updateMetadata(data *MetadataResponse) (retry bool, err error) {
 	client.lock.Lock()
 	defer client.lock.Unlock()
 
@@ -540,21 +615,10 @@ func (client *client) updateMetadata(data *MetadataResponse) ([]string, error) {
 	// - if it is an existing ID, but the address we have is stale, discard the old one and save it
 	// - otherwise ignore it, replacing our existing one would just bounce the connection
 	for _, broker := range data.Brokers {
-		if client.brokers[broker.ID()] == nil {
-			client.brokers[broker.ID()] = broker
-			Logger.Printf("Registered new broker #%d at %s", broker.ID(), broker.Addr())
-		} else if broker.Addr() != client.brokers[broker.ID()].Addr() {
-			safeAsyncClose(client.brokers[broker.ID()])
-			client.brokers[broker.ID()] = broker
-			Logger.Printf("Replaced registered broker #%d with %s", broker.ID(), broker.Addr())
-		}
+		client.registerBroker(broker)
 	}
 
-	toRetry := make(map[string]bool)
-
-	var err error
 	for _, topic := range data.Topics {
-
 		delete(client.metadata, topic.Name)
 		delete(client.cachedPartitionsResults, topic.Name)
 
@@ -566,10 +630,10 @@ func (client *client) updateMetadata(data *MetadataResponse) ([]string, error) {
 			continue
 		case ErrUnknownTopicOrPartition: // retry, do not store partial partition results
 			err = topic.Err
-			toRetry[topic.Name] = true
+			retry = true
 			continue
-		case ErrLeaderNotAvailable: // retry, but store partiial partition results
-			toRetry[topic.Name] = true
+		case ErrLeaderNotAvailable: // retry, but store partial partition results
+			retry = true
 			break
 		default: // don't retry, don't store partial results
 			Logger.Printf("Unexpected topic-level metadata error: %s", topic.Err)
@@ -581,7 +645,7 @@ func (client *client) updateMetadata(data *MetadataResponse) ([]string, error) {
 		for _, partition := range topic.Partitions {
 			client.metadata[topic.Name][partition.ID] = partition
 			if partition.Err == ErrLeaderNotAvailable {
-				toRetry[topic.Name] = true
+				retry = true
 			}
 		}
 
@@ -591,9 +655,73 @@ func (client *client) updateMetadata(data *MetadataResponse) ([]string, error) {
 		client.cachedPartitionsResults[topic.Name] = partitionCache
 	}
 
-	ret := make([]string, 0, len(toRetry))
-	for topic := range toRetry {
-		ret = append(ret, topic)
+	return
+}
+
+func (client *client) cachedCoordinator(consumerGroup string) *Broker {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+	if coordinatorID, ok := client.coordinators[consumerGroup]; !ok {
+		return nil
+	} else {
+		return client.brokers[coordinatorID]
+	}
+}
+
+func (client *client) getConsumerMetadata(consumerGroup string, attemptsRemaining int) (*ConsumerMetadataResponse, error) {
+	retry := func(err error) (*ConsumerMetadataResponse, error) {
+		if attemptsRemaining > 0 {
+			Logger.Printf("client/coordinator retrying after %dms... (%d attempts remaining)\n", client.conf.Metadata.Retry.Backoff/time.Millisecond, attemptsRemaining)
+			time.Sleep(client.conf.Metadata.Retry.Backoff)
+			return client.getConsumerMetadata(consumerGroup, attemptsRemaining-1)
+		}
+		return nil, err
 	}
-	return ret, err
+
+	for broker := client.any(); broker != nil; broker = client.any() {
+		Logger.Printf("client/coordinator requesting coordinator for consumergroup %s from %s\n", consumerGroup, broker.Addr())
+
+		request := new(ConsumerMetadataRequest)
+		request.ConsumerGroup = consumerGroup
+
+		response, err := broker.GetConsumerMetadata(request)
+
+		if err != nil {
+			Logger.Printf("client/coordinator request to broker %s failed: %s\n", broker.Addr(), err)
+
+			switch err.(type) {
+			case PacketEncodingError:
+				return nil, err
+			default:
+				_ = broker.Close()
+				client.deregisterBroker(broker)
+				continue
+			}
+		}
+
+		switch response.Err {
+		case ErrNoError:
+			Logger.Printf("client/coordinator coordinator for consumergroup %s is #%d (%s)\n", consumerGroup, response.Coordinator.ID(), response.Coordinator.Addr())
+			return response, nil
+
+		case ErrConsumerCoordinatorNotAvailable:
+			Logger.Printf("client/coordinator coordinator for consumer group %s is not available\n", consumerGroup)
+
+			// This is very ugly, but this scenario will only happen once per cluster.
+			// The __consumer_offsets topic only has to be created one time.
+			// The number of partitions is not configurable, but partition 0 should always exist.
+			if _, err := client.Leader("__consumer_offsets", 0); err != nil {
+				Logger.Printf("client/coordinator the __consumer_offsets topic is not initialized completely yet. Waiting 2 seconds...\n")
+				time.Sleep(2 * time.Second)
+			}
+
+			return retry(ErrConsumerCoordinatorNotAvailable)
+		default:
+			return nil, response.Err
+		}
+	}
+
+	Logger.Println("client/coordinator no available broker to send consumer metadata request to")
+	client.resurrectDeadBrokers()
+	return retry(ErrOutOfBrokers)
 }
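
For orientation (this sketch is not part of the diff): a minimal example of how the GetOffset and Coordinator additions above might be called from application code. The broker address, topic, and group names are placeholders, and error handling is kept to a minimum.

package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	// Hypothetical usage sketch; "localhost:9092", "my_topic" and "my_group" are placeholders.
	client, err := sarama.NewClient([]string{"localhost:9092"}, sarama.NewConfig())
	if err != nil {
		log.Fatalln(err)
	}
	defer client.Close()

	// GetOffset asks the partition leader for an offset; OffsetNewest returns the offset
	// of the next message that will be produced to the partition.
	newest, err := client.GetOffset("my_topic", 0, sarama.OffsetNewest)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("newest offset:", newest)

	// Coordinator resolves (and caches) the consumer group coordinator broker.
	coordinator, err := client.Coordinator("my_group")
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("coordinator:", coordinator.Addr())
}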

+ 285 - 3
client_test.go

@@ -2,10 +2,12 @@ package sarama
 
 import (
 	"io"
+	"sync"
 	"testing"
+	"time"
 )
 
-func safeClose(t *testing.T, c io.Closer) {
+func safeClose(t testing.TB, c io.Closer) {
 	err := c.Close()
 	if err != nil {
 		t.Error(err)
@@ -199,6 +201,54 @@ func TestClientMetadata(t *testing.T) {
 	safeClose(t, client)
 }
 
+func TestClientGetOffset(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	leader := newMockBroker(t, 2)
+	leaderAddr := leader.Addr()
+
+	metadata := new(MetadataResponse)
+	metadata.AddTopicPartition("foo", 0, leader.BrokerID(), nil, nil, ErrNoError)
+	metadata.AddBroker(leaderAddr, leader.BrokerID())
+	seedBroker.Returns(metadata)
+
+	client, err := NewClient([]string{seedBroker.Addr()}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	offsetResponse := new(OffsetResponse)
+	offsetResponse.AddTopicPartition("foo", 0, 123)
+	leader.Returns(offsetResponse)
+
+	offset, err := client.GetOffset("foo", 0, OffsetNewest)
+	if err != nil {
+		t.Error(err)
+	}
+	if offset != 123 {
+		t.Error("Unexpected offset, got ", offset)
+	}
+
+	leader.Close()
+	seedBroker.Returns(metadata)
+
+	leader = newMockBrokerAddr(t, 2, leaderAddr)
+	offsetResponse = new(OffsetResponse)
+	offsetResponse.AddTopicPartition("foo", 0, 456)
+	leader.Returns(offsetResponse)
+
+	offset, err = client.GetOffset("foo", 0, OffsetNewest)
+	if err != nil {
+		t.Error(err)
+	}
+	if offset != 456 {
+		t.Error("Unexpected offset, got ", offset)
+	}
+
+	seedBroker.Close()
+	leader.Close()
+	safeClose(t, client)
+}
+
 func TestClientReceivingUnknownTopic(t *testing.T) {
 	seedBroker := newMockBroker(t, 1)
 
@@ -300,11 +350,10 @@ func TestClientRefreshBehaviour(t *testing.T) {
 	metadataResponse2.AddTopicPartition("my_topic", 0xb, leader.BrokerID(), nil, nil, ErrNoError)
 	seedBroker.Returns(metadataResponse2)
 
-	c, err := NewClient([]string{seedBroker.Addr()}, nil)
+	client, err := NewClient([]string{seedBroker.Addr()}, nil)
 	if err != nil {
 		t.Fatal(err)
 	}
-	client := c.(*client)
 
 	parts, err := client.Partitions("my_topic")
 	if err != nil {
@@ -324,3 +373,236 @@ func TestClientRefreshBehaviour(t *testing.T) {
 	seedBroker.Close()
 	safeClose(t, client)
 }
+
+func TestClientResurrectDeadSeeds(t *testing.T) {
+	initialSeed := newMockBroker(t, 0)
+	emptyMetadata := new(MetadataResponse)
+	initialSeed.Returns(emptyMetadata)
+
+	conf := NewConfig()
+	conf.Metadata.Retry.Backoff = 0
+	conf.Metadata.RefreshFrequency = 0
+	c, err := NewClient([]string{initialSeed.Addr()}, conf)
+	if err != nil {
+		t.Fatal(err)
+	}
+	initialSeed.Close()
+
+	client := c.(*client)
+
+	seed1 := newMockBroker(t, 1)
+	seed2 := newMockBroker(t, 2)
+	seed3 := newMockBroker(t, 3)
+	addr1 := seed1.Addr()
+	addr2 := seed2.Addr()
+	addr3 := seed3.Addr()
+
+	// Overwrite the seed brokers with a fixed ordering to make this test deterministic.
+	safeClose(t, client.seedBrokers[0])
+	client.seedBrokers = []*Broker{NewBroker(addr1), NewBroker(addr2), NewBroker(addr3)}
+	client.deadSeeds = []*Broker{}
+
+	wg := sync.WaitGroup{}
+	wg.Add(1)
+	go func() {
+		if err := client.RefreshMetadata(); err != nil {
+			t.Error(err)
+		}
+		wg.Done()
+	}()
+	seed1.Close()
+	seed2.Close()
+
+	seed1 = newMockBrokerAddr(t, 1, addr1)
+	seed2 = newMockBrokerAddr(t, 2, addr2)
+
+	seed3.Close()
+
+	seed1.Close()
+	seed2.Returns(emptyMetadata)
+
+	wg.Wait()
+
+	if len(client.seedBrokers) != 2 {
+		t.Error("incorrect number of live seeds")
+	}
+	if len(client.deadSeeds) != 1 {
+		t.Error("incorrect number of dead seeds")
+	}
+
+	safeClose(t, c)
+}
+
+func TestClientCoordinatorWithConsumerOffsetsTopic(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	staleCoordinator := newMockBroker(t, 2)
+	freshCoordinator := newMockBroker(t, 3)
+
+	replicas := []int32{staleCoordinator.BrokerID(), freshCoordinator.BrokerID()}
+	metadataResponse1 := new(MetadataResponse)
+	metadataResponse1.AddBroker(staleCoordinator.Addr(), staleCoordinator.BrokerID())
+	metadataResponse1.AddBroker(freshCoordinator.Addr(), freshCoordinator.BrokerID())
+	metadataResponse1.AddTopicPartition("__consumer_offsets", 0, replicas[0], replicas, replicas, ErrNoError)
+	seedBroker.Returns(metadataResponse1)
+
+	client, err := NewClient([]string{seedBroker.Addr()}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	coordinatorResponse1 := new(ConsumerMetadataResponse)
+	coordinatorResponse1.Err = ErrConsumerCoordinatorNotAvailable
+	seedBroker.Returns(coordinatorResponse1)
+
+	coordinatorResponse2 := new(ConsumerMetadataResponse)
+	coordinatorResponse2.CoordinatorID = staleCoordinator.BrokerID()
+	coordinatorResponse2.CoordinatorHost = "127.0.0.1"
+	coordinatorResponse2.CoordinatorPort = staleCoordinator.Port()
+
+	seedBroker.Returns(coordinatorResponse2)
+
+	broker, err := client.Coordinator("my_group")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if staleCoordinator.Addr() != broker.Addr() {
+		t.Errorf("Expected coordinator to have address %s, found %s", staleCoordinator.Addr(), broker.Addr())
+	}
+
+	if staleCoordinator.BrokerID() != broker.ID() {
+		t.Errorf("Expected coordinator to have ID %d, found %d", staleCoordinator.BrokerID(), broker.ID())
+	}
+
+	// Grab the cached value
+	broker2, err := client.Coordinator("my_group")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if broker2.Addr() != broker.Addr() {
+		t.Errorf("Expected the coordinator to be the same, but found %s vs. %s", broker2.Addr(), broker.Addr())
+	}
+
+	coordinatorResponse3 := new(ConsumerMetadataResponse)
+	coordinatorResponse3.CoordinatorID = freshCoordinator.BrokerID()
+	coordinatorResponse3.CoordinatorHost = "127.0.0.1"
+	coordinatorResponse3.CoordinatorPort = freshCoordinator.Port()
+
+	seedBroker.Returns(coordinatorResponse3)
+
+	// Refresh the locally cached value because it's stale
+	if err := client.RefreshCoordinator("my_group"); err != nil {
+		t.Error(err)
+	}
+
+	// Grab the fresh value
+	broker3, err := client.Coordinator("my_group")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if broker3.Addr() != freshCoordinator.Addr() {
+		t.Errorf("Expected the freshCoordinator to be returned, but found %s.", broker3.Addr())
+	}
+
+	freshCoordinator.Close()
+	staleCoordinator.Close()
+	seedBroker.Close()
+	safeClose(t, client)
+}
+
+func TestClientCoordinatorWithoutConsumerOffsetsTopic(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	coordinator := newMockBroker(t, 2)
+
+	metadataResponse1 := new(MetadataResponse)
+	seedBroker.Returns(metadataResponse1)
+
+	config := NewConfig()
+	config.Metadata.Retry.Max = 1
+	config.Metadata.Retry.Backoff = 0
+	client, err := NewClient([]string{seedBroker.Addr()}, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	coordinatorResponse1 := new(ConsumerMetadataResponse)
+	coordinatorResponse1.Err = ErrConsumerCoordinatorNotAvailable
+	seedBroker.Returns(coordinatorResponse1)
+
+	metadataResponse2 := new(MetadataResponse)
+	metadataResponse2.AddTopic("__consumer_offsets", ErrUnknownTopicOrPartition)
+	seedBroker.Returns(metadataResponse2)
+
+	replicas := []int32{coordinator.BrokerID()}
+	metadataResponse3 := new(MetadataResponse)
+	metadataResponse3.AddTopicPartition("__consumer_offsets", 0, replicas[0], replicas, replicas, ErrNoError)
+	seedBroker.Returns(metadataResponse3)
+
+	coordinatorResponse2 := new(ConsumerMetadataResponse)
+	coordinatorResponse2.CoordinatorID = coordinator.BrokerID()
+	coordinatorResponse2.CoordinatorHost = "127.0.0.1"
+	coordinatorResponse2.CoordinatorPort = coordinator.Port()
+
+	seedBroker.Returns(coordinatorResponse2)
+
+	broker, err := client.Coordinator("my_group")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if coordinator.Addr() != broker.Addr() {
+		t.Errorf("Expected coordinator to have address %s, found %s", coordinator.Addr(), broker.Addr())
+	}
+
+	if coordinator.BrokerID() != broker.ID() {
+		t.Errorf("Expected coordinator to have ID %d, found %d", coordinator.BrokerID(), broker.ID())
+	}
+
+	coordinator.Close()
+	seedBroker.Close()
+	safeClose(t, client)
+}
+
+func TestClientAutorefreshShutdownRace(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+
+	metadataResponse := new(MetadataResponse)
+	seedBroker.Returns(metadataResponse)
+
+	conf := NewConfig()
+	conf.Metadata.RefreshFrequency = 100 * time.Millisecond
+	client, err := NewClient([]string{seedBroker.Addr()}, conf)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Wait for the background refresh to kick in
+	time.Sleep(110 * time.Millisecond)
+
+	done := make(chan none)
+	go func() {
+		// Close the client
+		if err := client.Close(); err != nil {
+			t.Fatal(err)
+		}
+		close(done)
+	}()
+
+	// Wait for the Close to kick in
+	time.Sleep(10 * time.Millisecond)
+
+	// Then return some metadata to the still-running background thread
+	leader := newMockBroker(t, 2)
+	metadataResponse.AddBroker(leader.Addr(), leader.BrokerID())
+	metadataResponse.AddTopicPartition("foo", 0, leader.BrokerID(), []int32{2}, []int32{2}, ErrNoError)
+	seedBroker.Returns(metadataResponse)
+
+	<-done
+
+	seedBroker.Close()
+
+	// give the update time to happen so we get a panic if it's still running (which it shouldn't)
+	time.Sleep(10 * time.Millisecond)
+}
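
The tests above exercise coordinator caching. The sketch below (also not part of the diff, placeholders throughout) shows the matching application-side pattern: take the cached coordinator, and fall back to RefreshCoordinator when it turns out to be stale.

package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	client, err := sarama.NewClient([]string{"localhost:9092"}, sarama.NewConfig())
	if err != nil {
		log.Fatalln(err)
	}
	defer client.Close()

	// The first call hits the cluster; later calls are served from the cache.
	broker, err := client.Coordinator("my_group")
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("cached coordinator:", broker.Addr())

	// When the cached coordinator turns out to be stale, drop it and resolve it again.
	if err := client.RefreshCoordinator("my_group"); err != nil {
		log.Fatalln(err)
	}
	broker, err = client.Coordinator("my_group")
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("fresh coordinator:", broker.Addr())
}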

+ 6 - 0
config.go

@@ -20,6 +20,10 @@ type Config struct {
 			Enable bool        // Whether or not to use TLS when connecting to the broker (defaults to false).
 			Config *tls.Config // The TLS configuration to use for secure connections if enabled (defaults to nil).
 		}
+
+		// KeepAlive specifies the keep-alive period for an active network connection.
+		// If zero, keep-alives are disabled. (default is 0: disabled).
+		KeepAlive time.Duration
 	}
 
 	// Metadata is the namespace for metadata management properties used by the Client, and shared by the Producer/Consumer.
@@ -194,6 +198,8 @@ func (c *Config) Validate() error {
 		return ConfigurationError("Invalid Net.ReadTimeout, must be > 0")
 	case c.Net.WriteTimeout <= 0:
 		return ConfigurationError("Invalid Net.WriteTimeout, must be > 0")
+	case c.Net.KeepAlive < 0:
+		return ConfigurationError("Invalid Net.KeepAlive, must be >= 0")
 	}
 
 	// validate the Metadata values
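
A hedged configuration sketch tying the new Net.KeepAlive field to the existing Metadata settings; the broker address and values are illustrative, not recommendations.

package main

import (
	"log"
	"time"

	"github.com/Shopify/sarama"
)

func main() {
	config := sarama.NewConfig()
	config.Net.KeepAlive = 30 * time.Second // 0 (the default) leaves TCP keep-alives disabled
	config.Metadata.Retry.Max = 3
	config.Metadata.Retry.Backoff = 250 * time.Millisecond
	config.Metadata.RefreshFrequency = 10 * time.Minute // 0 disables the background refresher

	if err := config.Validate(); err != nil { // a negative KeepAlive is rejected here
		log.Fatalln(err)
	}

	client, err := sarama.NewClient([]string{"localhost:9092"}, config)
	if err != nil {
		log.Fatalln(err)
	}
	defer client.Close()
}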

+ 234 - 159
consumer.go

@@ -3,6 +3,7 @@ package sarama
 import (
 	"fmt"
 	"sync"
+	"sync/atomic"
 	"time"
 )
 
@@ -38,7 +39,20 @@ func (ce ConsumerErrors) Error() string {
 // Consumer manages PartitionConsumers which process Kafka messages from brokers. You MUST call Close()
 // on a consumer to avoid leaks, it will not be garbage-collected automatically when it passes out of
 // scope.
+//
+// Sarama's Consumer type does not currently support automatic consumer group rebalancing and offset tracking;
+// however, the https://github.com/wvanbergen/kafka library builds on Sarama to add this support. We plan
+// to integrate this functionality properly at a later date.
 type Consumer interface {
+
+	// Topics returns the set of available topics as retrieved from the cluster metadata.
+	// This method is the same as Client.Topics(), and is provided for convenience.
+	Topics() ([]string, error)
+
+	// Partitions returns the sorted list of all partition IDs for the given topic.
+	// This method is the same as Client.Partitions(), and is provided for convenience.
+	Partitions(topic string) ([]int32, error)
+
 	// ConsumePartition creates a PartitionConsumer on the given topic/partition with the given offset. It will
 	// return an error if this Consumer is already consuming on the given topic/partition. Offset can be a
 	// literal offset, or OffsetNewest or OffsetOldest
@@ -98,6 +112,14 @@ func (c *consumer) Close() error {
 	return nil
 }
 
+func (c *consumer) Topics() ([]string, error) {
+	return c.client.Topics()
+}
+
+func (c *consumer) Partitions(topic string) ([]int32, error) {
+	return c.client.Partitions(topic)
+}
+
 func (c *consumer) ConsumePartition(topic string, partition int32, offset int64) (PartitionConsumer, error) {
 	child := &partitionConsumer{
 		consumer:  c,
@@ -106,8 +128,9 @@ func (c *consumer) ConsumePartition(topic string, partition int32, offset int64)
 		partition: partition,
 		messages:  make(chan *ConsumerMessage, c.conf.ChannelBufferSize),
 		errors:    make(chan *ConsumerError, c.conf.ChannelBufferSize),
+		feeder:    make(chan *FetchResponse, 1),
 		trigger:   make(chan none, 1),
-		dying:     make(chan none),
+		dying:     make(chan error, 1),
 		fetchSize: c.conf.Consumer.Fetch.Default,
 	}
 
@@ -126,6 +149,7 @@ func (c *consumer) ConsumePartition(topic string, partition int32, offset int64)
 	}
 
 	go withRecover(child.dispatcher)
+	go withRecover(child.responseFeeder)
 
 	child.broker = c.refBrokerConsumer(leader)
 	child.broker.input <- child
@@ -162,25 +186,15 @@ func (c *consumer) refBrokerConsumer(broker *Broker) *brokerConsumer {
 	c.lock.Lock()
 	defer c.lock.Unlock()
 
-	brokerWorker := c.brokerConsumers[broker]
-	if brokerWorker == nil {
-		brokerWorker = &brokerConsumer{
-			consumer:         c,
-			broker:           broker,
-			input:            make(chan *partitionConsumer),
-			newSubscriptions: make(chan []*partitionConsumer),
-			wait:             make(chan none),
-			subscriptions:    make(map[*partitionConsumer]none),
-			refs:             0,
-		}
-		go withRecover(brokerWorker.subscriptionManager)
-		go withRecover(brokerWorker.subscriptionConsumer)
-		c.brokerConsumers[broker] = brokerWorker
+	bc := c.brokerConsumers[broker]
+	if bc == nil {
+		bc = c.newBrokerConsumer(broker)
+		c.brokerConsumers[broker] = bc
 	}
 
-	brokerWorker.refs++
+	bc.refs++
 
-	return brokerWorker
+	return bc
 }
 
 func (c *consumer) unrefBrokerConsumer(brokerWorker *brokerConsumer) {
@@ -211,11 +225,12 @@ func (c *consumer) abandonBrokerConsumer(brokerWorker *brokerConsumer) {
 // when it passes out of scope.
 //
 // The simplest way of using a PartitionConsumer is to loop over its Messages channel using a for/range
-// loop. The PartitionConsumer will under no circumstances stop by itself once it is started, it will
-// just keep retrying if it encounters errors. By default, it logs these errors to sarama.Logger;
-// if you want to handle errors yourself, set your config's Consumer.Return.Errors to true, and read
-// from the Errors channel as well, using a select statement or in a separate goroutine. Check out
-// the examples of Consumer to see examples of these different approaches.
+// loop. The PartitionConsumer will only stop itself in one case: when the offset being consumed is reported
+// as out of range by the brokers. In this case you should decide what you want to do (try a different offset,
+// notify a human, etc) and handle it appropriately. For all other error cases, it will just keep retrying.
+// By default, it logs these errors to sarama.Logger; if you want to be notified directly of all errors, set
+// your config's Consumer.Return.Errors to true and read from the Errors channel, using a select statement
+// or a separate goroutine. Check out the Consumer examples to see implementations of these different approaches.
 type PartitionConsumer interface {
 
 	// AsyncClose initiates a shutdown of the PartitionConsumer. This method will return immediately,
@@ -237,6 +252,11 @@ type PartitionConsumer interface {
 	// errors are logged and not returned over this channel. If you want to implement any custom error
 	// handling, set your config's Consumer.Return.Errors setting to true, and read from this channel.
 	Errors() <-chan *ConsumerError
+
+	// HighWaterMarkOffset returns the high water mark offset of the partition, i.e. the offset that will
+	// be assigned to the next message that is produced. You can use this to determine how far behind
+	// the processing is.
+	HighWaterMarkOffset() int64
 }
 
 type partitionConsumer struct {
@@ -245,13 +265,16 @@ type partitionConsumer struct {
 	topic     string
 	partition int32
 
-	broker         *brokerConsumer
-	messages       chan *ConsumerMessage
-	errors         chan *ConsumerError
-	trigger, dying chan none
+	broker   *brokerConsumer
+	messages chan *ConsumerMessage
+	errors   chan *ConsumerError
+	feeder   chan *FetchResponse
+	trigger  chan none
+	dying    chan error
 
-	fetchSize int32
-	offset    int64
+	fetchSize           int32
+	offset              int64
+	highWaterMarkOffset int64
 }
 
 func (child *partitionConsumer) sendError(err error) {
@@ -279,6 +302,7 @@ func (child *partitionConsumer) dispatcher() {
 				child.broker = nil
 			}
 
+			Logger.Printf("consumer/%s/%d finding new broker\n", child.topic, child.partition)
 			if err := child.dispatch(); err != nil {
 				child.sendError(err)
 				child.trigger <- none{}
@@ -290,8 +314,7 @@ func (child *partitionConsumer) dispatcher() {
 		child.consumer.unrefBrokerConsumer(child.broker)
 	}
 	child.consumer.removeChild(child)
-	close(child.messages)
-	close(child.errors)
+	close(child.feeder)
 }
 
 func (child *partitionConsumer) dispatch() error {
@@ -312,22 +335,28 @@ func (child *partitionConsumer) dispatch() error {
 	return nil
 }
 
-func (child *partitionConsumer) chooseStartingOffset(offset int64) (err error) {
-	var time int64
+func (child *partitionConsumer) chooseStartingOffset(offset int64) error {
+	newestOffset, err := child.consumer.client.GetOffset(child.topic, child.partition, OffsetNewest)
+	if err != nil {
+		return err
+	}
+	oldestOffset, err := child.consumer.client.GetOffset(child.topic, child.partition, OffsetOldest)
+	if err != nil {
+		return err
+	}
 
-	switch offset {
-	case OffsetNewest, OffsetOldest:
-		time = offset
-	default:
-		if offset < 0 {
-			return ConfigurationError("Invalid offset")
-		}
+	switch {
+	case offset == OffsetNewest:
+		child.offset = newestOffset
+	case offset == OffsetOldest:
+		child.offset = oldestOffset
+	case offset >= oldestOffset && offset <= newestOffset:
 		child.offset = offset
-		return nil
+	default:
+		return ErrOffsetOutOfRange
 	}
 
-	child.offset, err = child.consumer.client.GetOffset(child.topic, child.partition, time)
-	return err
+	return nil
 }
 
 func (child *partitionConsumer) Messages() <-chan *ConsumerMessage {
@@ -339,11 +368,11 @@ func (child *partitionConsumer) Errors() <-chan *ConsumerError {
 }
 
 func (child *partitionConsumer) AsyncClose() {
-	// this triggers whatever worker owns this child to abandon it and close its trigger channel, which causes
+	// this triggers whatever broker owns this child to abandon it and close its trigger channel, which causes
 	// the dispatcher to exit its loop, which removes it from the consumer then closes its 'messages' and
 	// 'errors' channel (alternatively, if the child is already at the dispatcher for some reason, that will
 	// also just close itself)
-	close(child.dying)
+	child.dying <- nil
 }
 
 func (child *partitionConsumer) Close() error {
@@ -366,6 +395,104 @@ func (child *partitionConsumer) Close() error {
 	return nil
 }
 
+func (child *partitionConsumer) HighWaterMarkOffset() int64 {
+	return atomic.LoadInt64(&child.highWaterMarkOffset)
+}
+
+func (child *partitionConsumer) responseFeeder() {
+	for response := range child.feeder {
+		switch err := child.handleResponse(response); err {
+		case nil:
+			break
+		case ErrOffsetOutOfRange:
+			// there's no point in retrying this; it will just fail the same way again
+			// so shut it down and force the user to choose what to do
+			Logger.Printf("consumer/%s/%d shutting down because %s\n", child.topic, child.partition, err)
+			child.sendError(err)
+			child.AsyncClose()
+		case ErrUnknownTopicOrPartition, ErrNotLeaderForPartition, ErrLeaderNotAvailable:
+			// these three are not fatal errors, but do require redispatching
+			child.dying <- err
+		default:
+			// dunno, tell the user and try redispatching
+			child.sendError(err)
+			child.dying <- err
+		}
+
+		child.broker.acks.Done()
+	}
+
+	close(child.messages)
+	close(child.errors)
+}
+
+func (child *partitionConsumer) handleResponse(response *FetchResponse) error {
+	block := response.GetBlock(child.topic, child.partition)
+	if block == nil {
+		return ErrIncompleteResponse
+	}
+
+	if block.Err != ErrNoError {
+		return block.Err
+	}
+
+	if len(block.MsgSet.Messages) == 0 {
+		// We got no messages. If we got a partial trailing message then we need to ask for more data.
+		// Otherwise we just poll again and wait for one to be produced...
+		if block.MsgSet.PartialTrailingMessage {
+			if child.conf.Consumer.Fetch.Max > 0 && child.fetchSize == child.conf.Consumer.Fetch.Max {
+				// we can't ask for more data, we've hit the configured limit
+				child.sendError(ErrMessageTooLarge)
+				child.offset++ // skip this one so we can keep processing future messages
+			} else {
+				child.fetchSize *= 2
+				if child.conf.Consumer.Fetch.Max > 0 && child.fetchSize > child.conf.Consumer.Fetch.Max {
+					child.fetchSize = child.conf.Consumer.Fetch.Max
+				}
+			}
+		}
+
+		return nil
+	}
+
+	// we got messages, reset our fetch size in case it was increased for a previous request
+	child.fetchSize = child.conf.Consumer.Fetch.Default
+	atomic.StoreInt64(&child.highWaterMarkOffset, block.HighWaterMarkOffset)
+
+	incomplete := false
+	atLeastOne := false
+	prelude := true
+	for _, msgBlock := range block.MsgSet.Messages {
+
+		for _, msg := range msgBlock.Messages() {
+			if prelude && msg.Offset < child.offset {
+				continue
+			}
+			prelude = false
+
+			if msg.Offset >= child.offset {
+				atLeastOne = true
+				child.messages <- &ConsumerMessage{
+					Topic:     child.topic,
+					Partition: child.partition,
+					Key:       msg.Msg.Key,
+					Value:     msg.Msg.Value,
+					Offset:    msg.Offset,
+				}
+				child.offset = msg.Offset + 1
+			} else {
+				incomplete = true
+			}
+		}
+
+	}
+
+	if incomplete || !atLeastOne {
+		return ErrIncompleteResponse
+	}
+	return nil
+}
+
 // brokerConsumer
 
 type brokerConsumer struct {
@@ -375,10 +502,28 @@ type brokerConsumer struct {
 	newSubscriptions chan []*partitionConsumer
 	wait             chan none
 	subscriptions    map[*partitionConsumer]none
+	acks             sync.WaitGroup
 	refs             int
 }
 
-func (w *brokerConsumer) subscriptionManager() {
+func (c *consumer) newBrokerConsumer(broker *Broker) *brokerConsumer {
+	bc := &brokerConsumer{
+		consumer:         c,
+		broker:           broker,
+		input:            make(chan *partitionConsumer),
+		newSubscriptions: make(chan []*partitionConsumer),
+		wait:             make(chan none),
+		subscriptions:    make(map[*partitionConsumer]none),
+		refs:             0,
+	}
+
+	go withRecover(bc.subscriptionManager)
+	go withRecover(bc.subscriptionConsumer)
+
+	return bc
+}
+
+func (bc *brokerConsumer) subscriptionManager() {
 	var buffer []*partitionConsumer
 
 	// The subscriptionManager constantly accepts new subscriptions on `input` (even when the main subscriptionConsumer
@@ -389,97 +534,98 @@ func (w *brokerConsumer) subscriptionManager() {
 	for {
 		if len(buffer) > 0 {
 			select {
-			case event, ok := <-w.input:
+			case event, ok := <-bc.input:
 				if !ok {
 					goto done
 				}
 				buffer = append(buffer, event)
-			case w.newSubscriptions <- buffer:
+			case bc.newSubscriptions <- buffer:
 				buffer = nil
-			case w.wait <- none{}:
+			case bc.wait <- none{}:
 			}
 		} else {
 			select {
-			case event, ok := <-w.input:
+			case event, ok := <-bc.input:
 				if !ok {
 					goto done
 				}
 				buffer = append(buffer, event)
-			case w.newSubscriptions <- nil:
+			case bc.newSubscriptions <- nil:
 			}
 		}
 	}
 
 done:
-	close(w.wait)
+	close(bc.wait)
 	if len(buffer) > 0 {
-		w.newSubscriptions <- buffer
+		bc.newSubscriptions <- buffer
 	}
-	close(w.newSubscriptions)
+	close(bc.newSubscriptions)
 }
 
-func (w *brokerConsumer) subscriptionConsumer() {
-	<-w.wait // wait for our first piece of work
+func (bc *brokerConsumer) subscriptionConsumer() {
+	<-bc.wait // wait for our first piece of work
 
 	// the subscriptionManager ensures we will get nil right away if no new subscriptions are available
-	for newSubscriptions := range w.newSubscriptions {
-		w.updateSubscriptionCache(newSubscriptions)
+	for newSubscriptions := range bc.newSubscriptions {
+		bc.updateSubscriptionCache(newSubscriptions)
 
-		if len(w.subscriptions) == 0 {
+		if len(bc.subscriptions) == 0 {
 			// We're about to be shut down or we're about to receive more subscriptions.
 			// Either way, the signal just hasn't propagated to our goroutine yet.
-			<-w.wait
+			<-bc.wait
 			continue
 		}
 
-		response, err := w.fetchNewMessages()
+		response, err := bc.fetchNewMessages()
 
 		if err != nil {
-			Logger.Printf("Unexpected error processing FetchRequest; disconnecting from broker %s: %s\n", w.broker.addr, err)
-			w.abort(err)
+			Logger.Printf("consumer/broker/%d disconnecting due to error processing FetchRequest: %s\n", bc.broker.ID(), err)
+			bc.abort(err)
 			return
 		}
 
-		for child := range w.subscriptions {
-			block := response.GetBlock(child.topic, child.partition)
-			if block == nil {
-				child.sendError(ErrIncompleteResponse)
-				child.trigger <- none{}
-				delete(w.subscriptions, child)
-				continue
-			}
-
-			w.handleResponse(child, block)
+		bc.acks.Add(len(bc.subscriptions))
+		for child := range bc.subscriptions {
+			child.feeder <- response
 		}
+		bc.acks.Wait()
 	}
 }
 
-func (w *brokerConsumer) updateSubscriptionCache(newSubscriptions []*partitionConsumer) {
+func (bc *brokerConsumer) updateSubscriptionCache(newSubscriptions []*partitionConsumer) {
 	// take new subscriptions, and abandon subscriptions that have been closed
 	for _, child := range newSubscriptions {
-		w.subscriptions[child] = none{}
+		bc.subscriptions[child] = none{}
+		Logger.Printf("consumer/broker/%d added subscription to %s/%d\n", bc.broker.ID(), child.topic, child.partition)
 	}
 
-	for child := range w.subscriptions {
+	for child := range bc.subscriptions {
 		select {
-		case <-child.dying:
-			close(child.trigger)
-			delete(w.subscriptions, child)
+		case err := <-child.dying:
+			if err == nil {
+				Logger.Printf("consumer/broker/%d closed dead subscription to %s/%d\n", bc.broker.ID(), child.topic, child.partition)
+				close(child.trigger)
+			} else {
+				Logger.Printf("consumer/broker/%d abandoned subscription to %s/%d because %s\n", bc.broker.ID(), child.topic, child.partition, err)
+				child.trigger <- none{}
+			}
+			delete(bc.subscriptions, child)
 		default:
 		}
 	}
 }
 
-func (w *brokerConsumer) abort(err error) {
-	w.consumer.abandonBrokerConsumer(w)
-	_ = w.broker.Close() // we don't care about the error this might return, we already have one
+func (bc *brokerConsumer) abort(err error) {
+	bc.consumer.abandonBrokerConsumer(bc)
+	_ = bc.broker.Close() // we don't care about the error this might return, we already have one
 
-	for child := range w.subscriptions {
+	for child := range bc.subscriptions {
 		child.sendError(err)
 		child.trigger <- none{}
 	}
 
-	for newSubscription := range w.newSubscriptions {
+	for newSubscription := range bc.newSubscriptions {
 		for _, child := range newSubscription {
 			child.sendError(err)
 			child.trigger <- none{}
@@ -487,86 +633,15 @@ func (w *brokerConsumer) abort(err error) {
 	}
 }
 
-func (w *brokerConsumer) fetchNewMessages() (*FetchResponse, error) {
+func (bc *brokerConsumer) fetchNewMessages() (*FetchResponse, error) {
 	request := &FetchRequest{
-		MinBytes:    w.consumer.conf.Consumer.Fetch.Min,
-		MaxWaitTime: int32(w.consumer.conf.Consumer.MaxWaitTime / time.Millisecond),
+		MinBytes:    bc.consumer.conf.Consumer.Fetch.Min,
+		MaxWaitTime: int32(bc.consumer.conf.Consumer.MaxWaitTime / time.Millisecond),
 	}
 
-	for child := range w.subscriptions {
+	for child := range bc.subscriptions {
 		request.AddBlock(child.topic, child.partition, child.offset, child.fetchSize)
 	}
 
-	return w.broker.Fetch(request)
-}
-
-func (w *brokerConsumer) handleResponse(child *partitionConsumer, block *FetchResponseBlock) {
-	switch block.Err {
-	case ErrNoError:
-		break
-	default:
-		child.sendError(block.Err)
-		fallthrough
-	case ErrUnknownTopicOrPartition, ErrNotLeaderForPartition, ErrLeaderNotAvailable:
-		// doesn't belong to us, redispatch it
-		child.trigger <- none{}
-		delete(w.subscriptions, child)
-		return
-	}
-
-	if len(block.MsgSet.Messages) == 0 {
-		// We got no messages. If we got a trailing one then we need to ask for more data.
-		// Otherwise we just poll again and wait for one to be produced...
-		if block.MsgSet.PartialTrailingMessage {
-			if child.conf.Consumer.Fetch.Max > 0 && child.fetchSize == child.conf.Consumer.Fetch.Max {
-				// we can't ask for more data, we've hit the configured limit
-				child.sendError(ErrMessageTooLarge)
-				child.offset++ // skip this one so we can keep processing future messages
-			} else {
-				child.fetchSize *= 2
-				if child.conf.Consumer.Fetch.Max > 0 && child.fetchSize > child.conf.Consumer.Fetch.Max {
-					child.fetchSize = child.conf.Consumer.Fetch.Max
-				}
-			}
-		}
-
-		return
-	}
-
-	// we got messages, reset our fetch size in case it was increased for a previous request
-	child.fetchSize = child.conf.Consumer.Fetch.Default
-
-	incomplete := false
-	atLeastOne := false
-	prelude := true
-	for _, msgBlock := range block.MsgSet.Messages {
-
-		for _, msg := range msgBlock.Messages() {
-			if prelude && msg.Offset < child.offset {
-				continue
-			}
-			prelude = false
-
-			if msg.Offset >= child.offset {
-				atLeastOne = true
-				child.messages <- &ConsumerMessage{
-					Topic:     child.topic,
-					Partition: child.partition,
-					Key:       msg.Msg.Key,
-					Value:     msg.Msg.Value,
-					Offset:    msg.Offset,
-				}
-				child.offset = msg.Offset + 1
-			} else {
-				incomplete = true
-			}
-		}
-
-	}
-
-	if incomplete || !atLeastOne {
-		child.sendError(ErrIncompleteResponse)
-		child.trigger <- none{}
-		delete(w.subscriptions, child)
-	}
+	return bc.broker.Fetch(request)
 }
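
To complement the reworked consumer internals above, a hedged end-to-end sketch of the PartitionConsumer API described in the doc comments (Messages, Errors, HighWaterMarkOffset). The broker address and topic are placeholders.

package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	config := sarama.NewConfig()
	config.Consumer.Return.Errors = true // receive errors on Errors() instead of only logging them

	consumer, err := sarama.NewConsumer([]string{"localhost:9092"}, config)
	if err != nil {
		log.Fatalln(err)
	}
	defer consumer.Close()

	pc, err := consumer.ConsumePartition("my_topic", 0, sarama.OffsetOldest)
	if err != nil {
		log.Fatalln(err)
	}
	defer pc.Close()

	for {
		select {
		case msg, ok := <-pc.Messages():
			if !ok {
				return // channel closed: the consumed offset was reported as out of range
			}
			lag := pc.HighWaterMarkOffset() - msg.Offset - 1 // rough measure of how far behind we are
			log.Printf("partition %d offset %d (lag %d): %s", msg.Partition, msg.Offset, lag, msg.Value)
		case err := <-pc.Errors():
			log.Println("consume error:", err)
		}
	}
}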

+ 5 - 0
consumer_metadata_request.go

@@ -8,6 +8,11 @@ func (r *ConsumerMetadataRequest) encode(pe packetEncoder) error {
 	return pe.putString(r.ConsumerGroup)
 }
 
+func (r *ConsumerMetadataRequest) decode(pd packetDecoder) (err error) {
+	r.ConsumerGroup, err = pd.getString()
+	return err
+}
+
 func (r *ConsumerMetadataRequest) key() int16 {
 	return 10
 }

+ 2 - 2
consumer_metadata_request_test.go

@@ -12,8 +12,8 @@ var (
 
 func TestConsumerMetadataRequest(t *testing.T) {
 	request := new(ConsumerMetadataRequest)
-	testEncodable(t, "empty string", request, consumerMetadataRequestEmpty)
+	testRequest(t, "empty string", request, consumerMetadataRequestEmpty)
 
 	request.ConsumerGroup = "foobar"
-	testEncodable(t, "with string", request, consumerMetadataRequestString)
+	testRequest(t, "with string", request, consumerMetadataRequestString)
 }

+ 33 - 8
consumer_metadata_response.go

@@ -1,10 +1,16 @@
 package sarama
 
+import (
+	"net"
+	"strconv"
+)
+
 type ConsumerMetadataResponse struct {
 	Err             KError
-	CoordinatorID   int32
-	CoordinatorHost string
-	CoordinatorPort int32
+	Coordinator     *Broker
+	CoordinatorID   int32  // deprecated: use Coordinator.ID()
+	CoordinatorHost string // deprecated: use Coordinator.Addr()
+	CoordinatorPort int32  // deprecated: use Coordinator.Addr()
 }
 
 func (r *ConsumerMetadataResponse) decode(pd packetDecoder) (err error) {
@@ -14,20 +20,39 @@ func (r *ConsumerMetadataResponse) decode(pd packetDecoder) (err error) {
 	}
 	r.Err = KError(tmp)
 
-	r.CoordinatorID, err = pd.getInt32()
-	if err != nil {
+	r.Coordinator = new(Broker)
+	if err := r.Coordinator.decode(pd); err != nil {
 		return err
 	}
 
-	r.CoordinatorHost, err = pd.getString()
+	// this can all go away in 2.0, but we have to fill in deprecated fields to maintain
+	// backwards compatibility
+	host, portstr, err := net.SplitHostPort(r.Coordinator.Addr())
 	if err != nil {
 		return err
 	}
-
-	r.CoordinatorPort, err = pd.getInt32()
+	port, err := strconv.ParseInt(portstr, 10, 32)
 	if err != nil {
 		return err
 	}
+	r.CoordinatorID = r.Coordinator.ID()
+	r.CoordinatorHost = host
+	r.CoordinatorPort = int32(port)
+
+	return nil
+}
+
+func (r *ConsumerMetadataResponse) encode(pe packetEncoder) error {
+
+	pe.putInt16(int16(r.Err))
+
+	pe.putInt32(r.CoordinatorID)
+
+	if err := pe.putString(r.CoordinatorHost); err != nil {
+		return err
+	}
+
+	pe.putInt32(r.CoordinatorPort)
 
 	return nil
 }
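
Since CoordinatorID, CoordinatorHost and CoordinatorPort are now deprecated in favour of the Coordinator broker, a short sketch (not part of the diff; broker address and group name are placeholders) of reading the new field when issuing the request directly:

package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	broker := sarama.NewBroker("localhost:9092") // placeholder address
	if err := broker.Open(sarama.NewConfig()); err != nil {
		log.Fatalln(err)
	}
	defer broker.Close()

	request := &sarama.ConsumerMetadataRequest{ConsumerGroup: "my_group"}
	response, err := broker.GetConsumerMetadata(request)
	if err != nil {
		log.Fatalln(err)
	}
	if response.Err != sarama.ErrNoError {
		log.Fatalln(response.Err)
	}

	// Prefer the Coordinator broker over the deprecated scalar fields.
	log.Printf("coordinator #%d at %s", response.Coordinator.ID(), response.Coordinator.Addr())
}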

+ 8 - 0
consumer_metadata_response_test.go

@@ -58,4 +58,12 @@ func TestConsumerMetadataResponseSuccess(t *testing.T) {
 	if response.CoordinatorPort != 0xCCDD {
 		t.Error("Decoding produced incorrect coordinator port.")
 	}
+
+	if response.Coordinator.ID() != 0xAB {
+		t.Error("Decoding produced incorrect coordinator ID.")
+	}
+
+	if response.Coordinator.Addr() != "foo:52445" {
+		t.Error("Decoding produced incorrect coordinator address.")
+	}
 }

+ 204 - 27
consumer_test.go

@@ -18,7 +18,15 @@ func TestConsumerOffsetManual(t *testing.T) {
 	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
 	seedBroker.Returns(metadataResponse)
 
-	for i := 0; i <= 10; i++ {
+	offsetResponseNewest := new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 0, 2345)
+	leader.Returns(offsetResponseNewest)
+
+	offsetResponseOldest := new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 0, 0)
+	leader.Returns(offsetResponseOldest)
+
+	for i := 0; i < 10; i++ {
 		fetchResponse := new(FetchResponse)
 		fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(i+1234))
 		leader.Returns(fetchResponse)
@@ -51,7 +59,7 @@ func TestConsumerOffsetManual(t *testing.T) {
 	leader.Close()
 }
 
-func TestConsumerLatestOffset(t *testing.T) {
+func TestConsumerOffsetNewest(t *testing.T) {
 	seedBroker := newMockBroker(t, 1)
 	leader := newMockBroker(t, 2)
 
@@ -60,12 +68,18 @@ func TestConsumerLatestOffset(t *testing.T) {
 	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
 	seedBroker.Returns(metadataResponse)
 
-	offsetResponse := new(OffsetResponse)
-	offsetResponse.AddTopicPartition("my_topic", 0, 0x010101)
-	leader.Returns(offsetResponse)
+	offsetResponseNewest := new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 0, 10)
+	leader.Returns(offsetResponseNewest)
+
+	offsetResponseOldest := new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 0, 7)
+	leader.Returns(offsetResponseOldest)
 
 	fetchResponse := new(FetchResponse)
-	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), 0x010101)
+	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), 10)
+	block := fetchResponse.GetBlock("my_topic", 0)
+	block.HighWaterMarkOffset = 14
 	leader.Returns(fetchResponse)
 
 	master, err := NewConsumer([]string{seedBroker.Addr()}, nil)
@@ -79,16 +93,68 @@ func TestConsumerLatestOffset(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	msg := <-consumer.Messages()
+
+	// we asked for OffsetNewest, so the first message delivered should sit at the newest offset (10) from the OffsetResponse
+	if msg.Offset != 10 {
+		t.Error("Latest message offset not fetched correctly:", msg.Offset)
+	}
+
+	if hwmo := consumer.HighWaterMarkOffset(); hwmo != 14 {
+		t.Errorf("Expected high water mark offset 14, found %d", hwmo)
+	}
+
 	leader.Close()
 	safeClose(t, consumer)
 	safeClose(t, master)
 
-	// we deliver one message, so it should be one higher than we return in the OffsetResponse
-	if consumer.(*partitionConsumer).offset != 0x010102 {
+	// We deliver one message, so it should be one higher than we return in the OffsetResponse.
+	// This way it is set correctly for the next FetchRequest.
+	if consumer.(*partitionConsumer).offset != 11 {
 		t.Error("Latest offset not fetched correctly:", consumer.(*partitionConsumer).offset)
 	}
 }
 
+func TestConsumerShutsDownOutOfRange(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	leader := newMockBroker(t, 2)
+
+	metadataResponse := new(MetadataResponse)
+	metadataResponse.AddBroker(leader.Addr(), leader.BrokerID())
+	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
+	seedBroker.Returns(metadataResponse)
+
+	offsetResponseNewest := new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 0, 1234)
+	leader.Returns(offsetResponseNewest)
+
+	offsetResponseOldest := new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 0, 0)
+	leader.Returns(offsetResponseOldest)
+
+	fetchResponse := new(FetchResponse)
+	fetchResponse.AddError("my_topic", 0, ErrOffsetOutOfRange)
+	leader.Returns(fetchResponse)
+
+	master, err := NewConsumer([]string{seedBroker.Addr()}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	seedBroker.Close()
+
+	consumer, err := master.ConsumePartition("my_topic", 0, 101)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if _, ok := <-consumer.Messages(); ok {
+		t.Error("Expected the consumer to shut down")
+	}
+
+	leader.Close()
+	safeClose(t, master)
+}
+
 func TestConsumerFunnyOffsets(t *testing.T) {
 	// for topics that are compressed and/or compacted (different things!) we have to be
 	// able to handle receiving offsets that are non-sequential (though still strictly increasing) and
@@ -101,6 +167,14 @@ func TestConsumerFunnyOffsets(t *testing.T) {
 	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
 	seedBroker.Returns(metadataResponse)
 
+	offsetResponseNewest := new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 0, 1234)
+	leader.Returns(offsetResponseNewest)
+
+	offsetResponseOldest := new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 0, 0)
+	leader.Returns(offsetResponseOldest)
+
 	fetchResponse := new(FetchResponse)
 	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(1))
 	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(3))
@@ -120,8 +194,11 @@ func TestConsumerFunnyOffsets(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	message := <-consumer.Messages()
-	if message.Offset != 3 {
+	if message := <-consumer.Messages(); message.Offset != 3 {
+		t.Error("Incorrect message offset!")
+	}
+
+	if message := <-consumer.Messages(); message.Offset != 5 {
 		t.Error("Incorrect message offset!")
 	}
 
@@ -152,10 +229,26 @@ func TestConsumerRebalancingMultiplePartitions(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	offsetResponseNewest0 := new(OffsetResponse)
+	offsetResponseNewest0.AddTopicPartition("my_topic", 0, 1234)
+	leader0.Returns(offsetResponseNewest0)
+
+	offsetResponseOldest0 := new(OffsetResponse)
+	offsetResponseOldest0.AddTopicPartition("my_topic", 0, 0)
+	leader0.Returns(offsetResponseOldest0)
+
+	offsetResponseNewest1 := new(OffsetResponse)
+	offsetResponseNewest1.AddTopicPartition("my_topic", 1, 1234)
+	leader1.Returns(offsetResponseNewest1)
+
+	offsetResponseOldest1 := new(OffsetResponse)
+	offsetResponseOldest1.AddTopicPartition("my_topic", 1, 0)
+	leader1.Returns(offsetResponseOldest1)
+
 	// we expect to end up (eventually) consuming exactly ten messages on each partition
 	var wg sync.WaitGroup
-	for i := 0; i < 2; i++ {
-		consumer, err := master.ConsumePartition("my_topic", int32(i), 0)
+	for i := int32(0); i < 2; i++ {
+		consumer, err := master.ConsumePartition("my_topic", i, 0)
 		if err != nil {
 			t.Error(err)
 		}
@@ -179,7 +272,7 @@ func TestConsumerRebalancingMultiplePartitions(t *testing.T) {
 			}
 			safeClose(t, consumer)
 			wg.Done()
-		}(int32(i), consumer)
+		}(i, consumer)
 	}
 
 	// leader0 provides first four messages on partition 0
@@ -236,27 +329,16 @@ func TestConsumerRebalancingMultiplePartitions(t *testing.T) {
 	fetchResponse.AddMessage("my_topic", 1, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(8))
 	fetchResponse.AddMessage("my_topic", 1, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(9))
 	leader0.Returns(fetchResponse)
+	time.Sleep(50 * time.Millisecond) // dumbest way to force a particular response ordering
 
-	// leader0 provides last message  on partition 1
-	fetchResponse = new(FetchResponse)
-	fetchResponse.AddMessage("my_topic", 1, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(10))
-	leader0.Returns(fetchResponse)
-
-	// leader1 provides last message  on partition 0
-	fetchResponse = new(FetchResponse)
-	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(10))
-	leader1.Returns(fetchResponse)
-
-	wg.Wait()
 	leader1.Close()
 	leader0.Close()
+	wg.Wait()
 	seedBroker.Close()
 	safeClose(t, master)
 }
 
 func TestConsumerInterleavedClose(t *testing.T) {
-	t.Skip("Enable once bug #325 is fixed.")
-
 	seedBroker := newMockBroker(t, 1)
 	leader := newMockBroker(t, 2)
 
@@ -273,6 +355,14 @@ func TestConsumerInterleavedClose(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	offsetResponseNewest0 := new(OffsetResponse)
+	offsetResponseNewest0.AddTopicPartition("my_topic", 0, 1234)
+	leader.Returns(offsetResponseNewest0)
+
+	offsetResponseOldest0 := new(OffsetResponse)
+	offsetResponseOldest0.AddTopicPartition("my_topic", 0, 0)
+	leader.Returns(offsetResponseOldest0)
+
 	c0, err := master.ConsumePartition("my_topic", 0, 0)
 	if err != nil {
 		t.Fatal(err)
@@ -281,12 +371,23 @@ func TestConsumerInterleavedClose(t *testing.T) {
 	fetchResponse := new(FetchResponse)
 	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(0))
 	leader.Returns(fetchResponse)
+	time.Sleep(50 * time.Millisecond)
+
+	offsetResponseNewest1 := new(OffsetResponse)
+	offsetResponseNewest1.AddTopicPartition("my_topic", 1, 1234)
+	leader.Returns(offsetResponseNewest1)
+
+	offsetResponseOldest1 := new(OffsetResponse)
+	offsetResponseOldest1.AddTopicPartition("my_topic", 1, 0)
+	leader.Returns(offsetResponseOldest1)
 
 	c1, err := master.ConsumePartition("my_topic", 1, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
+	<-c0.Messages()
 
+	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(1))
 	fetchResponse.AddMessage("my_topic", 1, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(0))
 	leader.Returns(fetchResponse)
 
@@ -301,11 +402,13 @@ func TestConsumerBounceWithReferenceOpen(t *testing.T) {
 	seedBroker := newMockBroker(t, 1)
 	leader := newMockBroker(t, 2)
 	leaderAddr := leader.Addr()
+	tmp := newMockBroker(t, 3)
 
 	metadataResponse := new(MetadataResponse)
 	metadataResponse.AddBroker(leader.Addr(), leader.BrokerID())
+	metadataResponse.AddBroker(tmp.Addr(), tmp.BrokerID())
 	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
-	metadataResponse.AddTopicPartition("my_topic", 1, leader.BrokerID(), nil, nil, ErrNoError)
+	metadataResponse.AddTopicPartition("my_topic", 1, tmp.BrokerID(), nil, nil, ErrNoError)
 	seedBroker.Returns(metadataResponse)
 
 	config := NewConfig()
@@ -317,17 +420,44 @@ func TestConsumerBounceWithReferenceOpen(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	offsetResponseNewest := new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 0, 1234)
+	leader.Returns(offsetResponseNewest)
+
+	offsetResponseOldest := new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 0, 0)
+	leader.Returns(offsetResponseOldest)
+
 	c0, err := master.ConsumePartition("my_topic", 0, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
 
+	offsetResponseNewest = new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 1, 1234)
+	tmp.Returns(offsetResponseNewest)
+
+	offsetResponseOldest = new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 1, 0)
+	tmp.Returns(offsetResponseOldest)
+
 	c1, err := master.ConsumePartition("my_topic", 1, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
 
+	// redirect partition 1 back to the main leader
 	fetchResponse := new(FetchResponse)
+	fetchResponse.AddError("my_topic", 1, ErrNotLeaderForPartition)
+	tmp.Returns(fetchResponse)
+	metadataResponse = new(MetadataResponse)
+	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
+	metadataResponse.AddTopicPartition("my_topic", 1, leader.BrokerID(), nil, nil, ErrNoError)
+	seedBroker.Returns(metadataResponse)
+	time.Sleep(5 * time.Millisecond)
+
+	// now send one message to each partition to make sure everything is primed
+	fetchResponse = new(FetchResponse)
 	fetchResponse.AddMessage("my_topic", 0, nil, ByteEncoder([]byte{0x00, 0x0E}), int64(0))
 	fetchResponse.AddError("my_topic", 1, ErrNoError)
 	leader.Returns(fetchResponse)
@@ -339,6 +469,7 @@ func TestConsumerBounceWithReferenceOpen(t *testing.T) {
 	leader.Returns(fetchResponse)
 	<-c1.Messages()
 
+	// bounce the broker
 	leader.Close()
 	leader = newMockBrokerAddr(t, 2, leaderAddr)
 
@@ -365,6 +496,8 @@ func TestConsumerBounceWithReferenceOpen(t *testing.T) {
 	// send it back to the same broker
 	seedBroker.Returns(metadataResponse)
 
+	time.Sleep(5 * time.Millisecond)
+
 	select {
 	case <-c0.Messages():
 	case <-c1.Messages():
@@ -384,6 +517,50 @@ func TestConsumerBounceWithReferenceOpen(t *testing.T) {
 	}()
 	wg.Wait()
 	safeClose(t, master)
+	tmp.Close()
+}
+
+func TestConsumerOffsetOutOfRange(t *testing.T) {
+	seedBroker := newMockBroker(t, 1)
+	leader := newMockBroker(t, 2)
+
+	metadataResponse := new(MetadataResponse)
+	metadataResponse.AddBroker(leader.Addr(), leader.BrokerID())
+	metadataResponse.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, ErrNoError)
+	seedBroker.Returns(metadataResponse)
+
+	master, err := NewConsumer([]string{seedBroker.Addr()}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	seedBroker.Close()
+
+	offsetResponseNewest := new(OffsetResponse)
+	offsetResponseNewest.AddTopicPartition("my_topic", 0, 1234)
+
+	offsetResponseOldest := new(OffsetResponse)
+	offsetResponseOldest.AddTopicPartition("my_topic", 0, 2345)
+
+	leader.Returns(offsetResponseNewest)
+	leader.Returns(offsetResponseOldest)
+	if _, err := master.ConsumePartition("my_topic", 0, 0); err != ErrOffsetOutOfRange {
+		t.Fatal("Should return ErrOffsetOutOfRange, got:", err)
+	}
+
+	leader.Returns(offsetResponseNewest)
+	leader.Returns(offsetResponseOldest)
+	if _, err := master.ConsumePartition("my_topic", 0, 3456); err != ErrOffsetOutOfRange {
+		t.Fatal("Should return ErrOffsetOutOfRange, got:", err)
+	}
+
+	leader.Returns(offsetResponseNewest)
+	leader.Returns(offsetResponseOldest)
+	if _, err := master.ConsumePartition("my_topic", 0, -3); err != ErrOffsetOutOfRange {
+		t.Fatal("Should return ErrOffsetOutOfRange, got:", err)
+	}
+
+	leader.Close()
+	safeClose(t, master)
 }
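
TestConsumerOffsetOutOfRange above pins down the new behaviour: an out-of-range starting offset is rejected by ConsumePartition. A hedged helper sketch for application code follows; restarting from the oldest offset is only one possible policy, and the package name and stored offset are hypothetical.

// Package kafkautil is a hypothetical home for this helper; only the sarama calls are real API.
package kafkautil

import (
	"log"

	"github.com/Shopify/sarama"
)

// ConsumeFrom resumes from a previously stored offset, falling back to the oldest
// available offset when retention has already removed it.
func ConsumeFrom(consumer sarama.Consumer, topic string, partition int32, storedOffset int64) (sarama.PartitionConsumer, error) {
	pc, err := consumer.ConsumePartition(topic, partition, storedOffset)
	if err == sarama.ErrOffsetOutOfRange {
		log.Printf("offset %d for %s/%d is out of range, restarting from the oldest offset", storedOffset, topic, partition)
		pc, err = consumer.ConsumePartition(topic, partition, sarama.OffsetOldest)
	}
	return pc, err
}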
 
 // This example has the simplest use case of the consumer. It simply

+ 4 - 4
encoder_decoder.go

@@ -9,15 +9,15 @@ type encoder interface {
 }
 
 // Encode takes an Encoder and turns it into bytes.
-func encode(in encoder) ([]byte, error) {
-	if in == nil {
+func encode(e encoder) ([]byte, error) {
+	if e == nil {
 		return nil, nil
 	}
 
 	var prepEnc prepEncoder
 	var realEnc realEncoder
 
-	err := in.encode(&prepEnc)
+	err := e.encode(&prepEnc)
 	if err != nil {
 		return nil, err
 	}
@@ -27,7 +27,7 @@ func encode(in encoder) ([]byte, error) {
 	}
 
 	realEnc.raw = make([]byte, prepEnc.length)
-	err = in.encode(&realEnc)
+	err = e.encode(&realEnc)
 	if err != nil {
 		return nil, err
 	}

+ 53 - 0
fetch_request.go

@@ -11,6 +11,16 @@ func (f *fetchRequestBlock) encode(pe packetEncoder) error {
 	return nil
 }
 
+func (f *fetchRequestBlock) decode(pd packetDecoder) (err error) {
+	if f.fetchOffset, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if f.maxBytes, err = pd.getInt32(); err != nil {
+		return err
+	}
+	return nil
+}
+
 type FetchRequest struct {
 	MaxWaitTime int32
 	MinBytes    int32
@@ -45,6 +55,49 @@ func (f *FetchRequest) encode(pe packetEncoder) (err error) {
 	return nil
 }
 
+func (f *FetchRequest) decode(pd packetDecoder) (err error) {
+	if _, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if f.MaxWaitTime, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if f.MinBytes, err = pd.getInt32(); err != nil {
+		return err
+	}
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+	f.blocks = make(map[string]map[int32]*fetchRequestBlock)
+	for i := 0; i < topicCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		f.blocks[topic] = make(map[int32]*fetchRequestBlock)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			fetchBlock := &fetchRequestBlock{}
+			if err = fetchBlock.decode(pd); err != nil {
+				return err
+			}
+			f.blocks[topic][partition] = fetchBlock
+		}
+	}
+	return nil
+}
+
 func (f *FetchRequest) key() int16 {
 	return 1
 }

+ 3 - 3
fetch_request_test.go

@@ -21,14 +21,14 @@ var (
 
 func TestFetchRequest(t *testing.T) {
 	request := new(FetchRequest)
-	testEncodable(t, "no blocks", request, fetchRequestNoBlocks)
+	testRequest(t, "no blocks", request, fetchRequestNoBlocks)
 
 	request.MaxWaitTime = 0x20
 	request.MinBytes = 0xEF
-	testEncodable(t, "with properties", request, fetchRequestWithProperties)
+	testRequest(t, "with properties", request, fetchRequestWithProperties)
 
 	request.MaxWaitTime = 0
 	request.MinBytes = 0
 	request.AddBlock("topic", 0x12, 0x34, 0x56)
-	testEncodable(t, "one block", request, fetchRequestOneBlock)
+	testRequest(t, "one block", request, fetchRequestOneBlock)
 }

+ 90 - 0
functional_client_test.go

@@ -0,0 +1,90 @@
+package sarama
+
+import (
+	"fmt"
+	"testing"
+	"time"
+)
+
+func TestFuncConnectionFailure(t *testing.T) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	Proxies["kafka1"].Enabled = false
+	SaveProxy(t, "kafka1")
+
+	config := NewConfig()
+	config.Metadata.Retry.Max = 1
+
+	_, err := NewClient([]string{kafkaBrokers[0]}, config)
+	if err != ErrOutOfBrokers {
+		t.Fatal("Expected returned error to be ErrOutOfBrokers, but was: ", err)
+	}
+}
+
+func TestFuncClientMetadata(t *testing.T) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	config := NewConfig()
+	config.Metadata.Retry.Max = 1
+	config.Metadata.Retry.Backoff = 10 * time.Millisecond
+	client, err := NewClient(kafkaBrokers, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if err := client.RefreshMetadata("unknown_topic"); err != ErrUnknownTopicOrPartition {
+		t.Error("Expected ErrUnknownTopicOrPartition, got", err)
+	}
+
+	if _, err := client.Leader("unknown_topic", 0); err != ErrUnknownTopicOrPartition {
+		t.Error("Expected ErrUnknownTopicOrPartition, got", err)
+	}
+
+	if _, err := client.Replicas("invalid/topic", 0); err != ErrUnknownTopicOrPartition {
+		t.Error("Expected ErrUnknownTopicOrPartition, got", err)
+	}
+
+	partitions, err := client.Partitions("test.4")
+	if err != nil {
+		t.Error(err)
+	}
+	if len(partitions) != 4 {
+		t.Errorf("Expected test.4 topic to have 4 partitions, found %v", partitions)
+	}
+
+	partitions, err = client.Partitions("test.1")
+	if err != nil {
+		t.Error(err)
+	}
+	if len(partitions) != 1 {
+		t.Errorf("Expected test.1 topic to have 1 partition, found %v", partitions)
+	}
+
+	safeClose(t, client)
+}
+
+func TestFuncClientCoordinator(t *testing.T) {
+	checkKafkaVersion(t, "0.8.2")
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	client, err := NewClient(kafkaBrokers, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	for i := 0; i < 10; i++ {
+		broker, err := client.Coordinator(fmt.Sprintf("another_new_consumer_group_%d", i))
+		if err != nil {
+			t.Error(err)
+		}
+
+		if connected, err := broker.Connected(); !connected || err != nil {
+			t.Errorf("Expected coordinator broker %s to be properly connected.", broker.Addr())
+		}
+	}
+
+	safeClose(t, client)
+}

+ 61 - 0
functional_consumer_test.go

@@ -0,0 +1,61 @@
+package sarama
+
+import (
+	"math"
+	"testing"
+)
+
+func TestFuncConsumerOffsetOutOfRange(t *testing.T) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	consumer, err := NewConsumer(kafkaBrokers, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if _, err := consumer.ConsumePartition("test.1", 0, -10); err != ErrOffsetOutOfRange {
+		t.Error("Expected ErrOffsetOutOfRange, got:", err)
+	}
+
+	if _, err := consumer.ConsumePartition("test.1", 0, math.MaxInt64); err != ErrOffsetOutOfRange {
+		t.Error("Expected ErrOffsetOutOfRange, got:", err)
+	}
+
+	safeClose(t, consumer)
+}
+
+func TestConsumerHighWaterMarkOffset(t *testing.T) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	p, err := NewSyncProducer(kafkaBrokers, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer safeClose(t, p)
+
+	_, offset, err := p.SendMessage(&ProducerMessage{Topic: "test.1", Value: StringEncoder("Test")})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, err := NewConsumer(kafkaBrokers, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer safeClose(t, c)
+
+	pc, err := c.ConsumePartition("test.1", 0, OffsetOldest)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	<-pc.Messages()
+
+	if hwmo := pc.HighWaterMarkOffset(); hwmo != offset+1 {
+		t.Logf("Last produced offset %d; high water mark should be one higher but found %d.", offset, hwmo)
+	}
+
+	safeClose(t, pc)
+}

+ 203 - 0
functional_producer_test.go

@@ -0,0 +1,203 @@
+package sarama
+
+import (
+	"fmt"
+	"sync"
+	"testing"
+	"time"
+)
+
+const TestBatchSize = 1000
+
+func TestFuncProducing(t *testing.T) {
+	config := NewConfig()
+	testProducingMessages(t, config)
+}
+
+func TestFuncProducingGzip(t *testing.T) {
+	config := NewConfig()
+	config.Producer.Compression = CompressionGZIP
+	testProducingMessages(t, config)
+}
+
+func TestFuncProducingSnappy(t *testing.T) {
+	config := NewConfig()
+	config.Producer.Compression = CompressionSnappy
+	testProducingMessages(t, config)
+}
+
+func TestFuncProducingNoResponse(t *testing.T) {
+	config := NewConfig()
+	config.Producer.RequiredAcks = NoResponse
+	testProducingMessages(t, config)
+}
+
+func TestFuncProducingFlushing(t *testing.T) {
+	config := NewConfig()
+	config.Producer.Flush.Messages = TestBatchSize / 8
+	config.Producer.Flush.Frequency = 250 * time.Millisecond
+	testProducingMessages(t, config)
+}
+
+func TestFuncMultiPartitionProduce(t *testing.T) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	config := NewConfig()
+	config.ChannelBufferSize = 20
+	config.Producer.Flush.Frequency = 50 * time.Millisecond
+	config.Producer.Flush.Messages = 200
+	config.Producer.Return.Successes = true
+	producer, err := NewSyncProducer(kafkaBrokers, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var wg sync.WaitGroup
+	wg.Add(TestBatchSize)
+
+	for i := 1; i <= TestBatchSize; i++ {
+		go func(i int) {
+			defer wg.Done()
+			msg := &ProducerMessage{Topic: "test.64", Key: nil, Value: StringEncoder(fmt.Sprintf("hur %d", i))}
+			if _, _, err := producer.SendMessage(msg); err != nil {
+				t.Error(i, err)
+			}
+		}(i)
+	}
+
+	wg.Wait()
+	if err := producer.Close(); err != nil {
+		t.Error(err)
+	}
+}
+
+func TestFuncProducingToInvalidTopic(t *testing.T) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	producer, err := NewSyncProducer(kafkaBrokers, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if _, _, err := producer.SendMessage(&ProducerMessage{Topic: "in/valid"}); err != ErrUnknownTopicOrPartition {
+		t.Error("Expected ErrUnknownTopicOrPartition, found", err)
+	}
+
+	if _, _, err := producer.SendMessage(&ProducerMessage{Topic: "in/valid"}); err != ErrUnknownTopicOrPartition {
+		t.Error("Expected ErrUnknownTopicOrPartition, found", err)
+	}
+
+	safeClose(t, producer)
+}
+
+func testProducingMessages(t *testing.T, config *Config) {
+	setupFunctionalTest(t)
+	defer teardownFunctionalTest(t)
+
+	config.Producer.Return.Successes = true
+	config.Consumer.Return.Errors = true
+
+	client, err := NewClient(kafkaBrokers, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	master, err := NewConsumerFromClient(client)
+	if err != nil {
+		t.Fatal(err)
+	}
+	consumer, err := master.ConsumePartition("test.1", 0, OffsetNewest)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	producer, err := NewAsyncProducerFromClient(client)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedResponses := TestBatchSize
+	for i := 1; i <= TestBatchSize; {
+		msg := &ProducerMessage{Topic: "test.1", Key: nil, Value: StringEncoder(fmt.Sprintf("testing %d", i))}
+		select {
+		case producer.Input() <- msg:
+			i++
+		case ret := <-producer.Errors():
+			t.Fatal(ret.Err)
+		case <-producer.Successes():
+			expectedResponses--
+		}
+	}
+	for expectedResponses > 0 {
+		select {
+		case ret := <-producer.Errors():
+			t.Fatal(ret.Err)
+		case <-producer.Successes():
+			expectedResponses--
+		}
+	}
+	safeClose(t, producer)
+
+	for i := 1; i <= TestBatchSize; i++ {
+		select {
+		case <-time.After(10 * time.Second):
+			t.Fatal("No more events received in the last 10 seconds.")
+
+		case err := <-consumer.Errors():
+			t.Error(err)
+
+		case message := <-consumer.Messages():
+			if string(message.Value) != fmt.Sprintf("testing %d", i) {
+				t.Fatalf("Unexpected message with index %d: %s", i, message.Value)
+			}
+		}
+
+	}
+	safeClose(t, consumer)
+	safeClose(t, client)
+}
+
+// Benchmarks
+
+func BenchmarkProducerSmall(b *testing.B) {
+	benchmarkProducer(b, nil, "test.64", ByteEncoder(make([]byte, 128)))
+}
+func BenchmarkProducerMedium(b *testing.B) {
+	benchmarkProducer(b, nil, "test.64", ByteEncoder(make([]byte, 1024)))
+}
+func BenchmarkProducerLarge(b *testing.B) {
+	benchmarkProducer(b, nil, "test.64", ByteEncoder(make([]byte, 8192)))
+}
+func BenchmarkProducerSmallSinglePartition(b *testing.B) {
+	benchmarkProducer(b, nil, "test.1", ByteEncoder(make([]byte, 128)))
+}
+func BenchmarkProducerMediumSnappy(b *testing.B) {
+	conf := NewConfig()
+	conf.Producer.Compression = CompressionSnappy
+	benchmarkProducer(b, conf, "test.1", ByteEncoder(make([]byte, 1024)))
+}
+
+func benchmarkProducer(b *testing.B, conf *Config, topic string, value Encoder) {
+	setupFunctionalTest(b)
+	defer teardownFunctionalTest(b)
+
+	producer, err := NewAsyncProducer(kafkaBrokers, conf)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.ResetTimer()
+
+	for i := 1; i <= b.N; {
+		msg := &ProducerMessage{Topic: topic, Key: StringEncoder(fmt.Sprintf("%d", i)), Value: value}
+		select {
+		case producer.Input() <- msg:
+			i++
+		case ret := <-producer.Errors():
+			b.Fatal(ret.Err)
+		}
+	}
+	safeClose(b, producer)
+}

+ 81 - 192
functional_test.go

@@ -1,29 +1,52 @@
 package sarama
 
 import (
-	"fmt"
 	"log"
+	"math/rand"
 	"net"
 	"os"
+	"strconv"
 	"strings"
-	"sync"
 	"testing"
 	"time"
+
+	toxiproxy "github.com/Shopify/toxiproxy/client"
 )
 
 const (
-	TestBatchSize = 1000
-
-	VagrantKafkaPeers     = "192.168.100.67:6667,192.168.100.67:6668,192.168.100.67:6669,192.168.100.67:6670,192.168.100.67:6671"
+	VagrantToxiproxy      = "http://192.168.100.67:8474"
+	VagrantKafkaPeers     = "192.168.100.67:9091,192.168.100.67:9092,192.168.100.67:9093,192.168.100.67:9094,192.168.100.67:9095"
 	VagrantZookeeperPeers = "192.168.100.67:2181,192.168.100.67:2182,192.168.100.67:2183,192.168.100.67:2184,192.168.100.67:2185"
 )
 
 var (
-	kafkaIsAvailable, kafkaShouldBeAvailable bool
-	kafkaBrokers                             []string
+	kafkaAvailable, kafkaRequired bool
+	kafkaBrokers                  []string
+
+	proxyClient  *toxiproxy.Client
+	Proxies      map[string]*toxiproxy.Proxy
+	ZKProxies    = []string{"zk1", "zk2", "zk3", "zk4", "zk5"}
+	KafkaProxies = []string{"kafka1", "kafka2", "kafka3", "kafka4", "kafka5"}
 )
 
 func init() {
+	if os.Getenv("DEBUG") == "true" {
+		Logger = log.New(os.Stdout, "[sarama] ", log.LstdFlags)
+	}
+
+	seed := time.Now().UTC().UnixNano()
+	if tmp := os.Getenv("TEST_SEED"); tmp != "" {
+		seed, _ = strconv.ParseInt(tmp, 0, 64)
+	}
+	Logger.Println("Using random seed:", seed)
+	rand.Seed(seed)
+
+	proxyAddr := os.Getenv("TOXIPROXY_ADDR")
+	if proxyAddr == "" {
+		proxyAddr = VagrantToxiproxy
+	}
+	proxyClient = toxiproxy.NewClient(proxyAddr)
+
 	kafkaPeers := os.Getenv("KAFKA_PEERS")
 	if kafkaPeers == "" {
 		kafkaPeers = VagrantKafkaPeers
@@ -32,20 +55,16 @@ func init() {
 
 	if c, err := net.DialTimeout("tcp", kafkaBrokers[0], 5*time.Second); err == nil {
 		if err = c.Close(); err == nil {
-			kafkaIsAvailable = true
+			kafkaAvailable = true
 		}
 	}
 
-	kafkaShouldBeAvailable = os.Getenv("CI") != ""
-
-	if os.Getenv("DEBUG") == "true" {
-		Logger = log.New(os.Stdout, "[sarama] ", log.LstdFlags)
-	}
+	kafkaRequired = os.Getenv("CI") != ""
 }
 
-func checkKafkaAvailability(t *testing.T) {
-	if !kafkaIsAvailable {
-		if kafkaShouldBeAvailable {
+func checkKafkaAvailability(t testing.TB) {
+	if !kafkaAvailable {
+		if kafkaRequired {
 			t.Fatalf("Kafka broker is not available on %s. Set KAFKA_PEERS to connect to Kafka on a different location.", kafkaBrokers[0])
 		} else {
 			t.Skipf("Kafka broker is not available on %s. Set KAFKA_PEERS to connect to Kafka on a different location.", kafkaBrokers[0])
@@ -53,205 +72,75 @@ func checkKafkaAvailability(t *testing.T) {
 	}
 }
 
-func TestFuncConnectionFailure(t *testing.T) {
-	config := NewConfig()
-	config.Metadata.Retry.Max = 1
-
-	_, err := NewClient([]string{"localhost:9000"}, config)
-	if err != ErrOutOfBrokers {
-		t.Fatal("Expected returned error to be ErrOutOfBrokers, but was: ", err)
+func checkKafkaVersion(t testing.TB, requiredVersion string) {
+	kafkaVersion := os.Getenv("KAFKA_VERSION")
+	if kafkaVersion == "" {
+		t.Logf("No KAFKA_VERSION set. This test requires Kafka version %s or higher. Continuing...", requiredVersion)
+	} else {
+		available := parseKafkaVersion(kafkaVersion)
+		required := parseKafkaVersion(requiredVersion)
+		if !available.satisfies(required) {
+			t.Skipf("Kafka version %s is required for this test; you have %s. Skipping...", requiredVersion, kafkaVersion)
+		}
 	}
 }
 
-func TestFuncClientMetadata(t *testing.T) {
-	checkKafkaAvailability(t)
-
-	config := NewConfig()
-	config.Metadata.Retry.Max = 1
-	config.Metadata.Retry.Backoff = 10 * time.Millisecond
-	client, err := NewClient(kafkaBrokers, config)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if err := client.RefreshMetadata("unknown_topic"); err != ErrUnknownTopicOrPartition {
-		t.Error("Expected ErrUnknownTopicOrPartition, got", err)
-	}
-
-	if _, err := client.Leader("unknown_topic", 0); err != ErrUnknownTopicOrPartition {
-		t.Error("Expected ErrUnknownTopicOrPartition, got", err)
-	}
-
-	if _, err := client.Replicas("invalid/topic", 0); err != ErrUnknownTopicOrPartition {
-		t.Error("Expected ErrUnknownTopicOrPartition, got", err)
-	}
-
-	partitions, err := client.Partitions("multi_partition")
-	if err != nil {
-		t.Error(err)
-	}
-	if len(partitions) != 2 {
-		t.Errorf("Expected multi_partition topic to have 2 partitions, found %v", partitions)
-	}
-
-	partitions, err = client.Partitions("single_partition")
-	if err != nil {
+func resetProxies(t testing.TB) {
+	if err := proxyClient.ResetState(); err != nil {
 		t.Error(err)
 	}
-	if len(partitions) != 1 {
-		t.Errorf("Expected single_partition topic to have 1 partitions, found %v", partitions)
-	}
-
-	safeClose(t, client)
-}
-
-func TestFuncProducing(t *testing.T) {
-	config := NewConfig()
-	testProducingMessages(t, config)
-}
-
-func TestFuncProducingGzip(t *testing.T) {
-	config := NewConfig()
-	config.Producer.Compression = CompressionGZIP
-	testProducingMessages(t, config)
-}
-
-func TestFuncProducingSnappy(t *testing.T) {
-	config := NewConfig()
-	config.Producer.Compression = CompressionSnappy
-	testProducingMessages(t, config)
-}
-
-func TestFuncProducingNoResponse(t *testing.T) {
-	config := NewConfig()
-	config.Producer.RequiredAcks = NoResponse
-	testProducingMessages(t, config)
+	Proxies = nil
 }
 
-func TestFuncProducingFlushing(t *testing.T) {
-	config := NewConfig()
-	config.Producer.Flush.Messages = TestBatchSize / 8
-	config.Producer.Flush.Frequency = 250 * time.Millisecond
-	testProducingMessages(t, config)
-}
-
-func TestFuncMultiPartitionProduce(t *testing.T) {
-	checkKafkaAvailability(t)
-
-	config := NewConfig()
-	config.ChannelBufferSize = 20
-	config.Producer.Flush.Frequency = 50 * time.Millisecond
-	config.Producer.Flush.Messages = 200
-	config.Producer.Return.Successes = true
-	producer, err := NewAsyncProducer(kafkaBrokers, config)
+func fetchProxies(t testing.TB) {
+	var err error
+	Proxies, err = proxyClient.Proxies()
 	if err != nil {
 		t.Fatal(err)
 	}
-
-	var wg sync.WaitGroup
-	wg.Add(TestBatchSize)
-
-	for i := 1; i <= TestBatchSize; i++ {
-
-		go func(i int, w *sync.WaitGroup) {
-			defer w.Done()
-			msg := &ProducerMessage{Topic: "multi_partition", Key: nil, Value: StringEncoder(fmt.Sprintf("hur %d", i))}
-			producer.Input() <- msg
-			select {
-			case ret := <-producer.Errors():
-				t.Fatal(ret.Err)
-			case <-producer.Successes():
-			}
-		}(i, &wg)
-	}
-
-	wg.Wait()
-	if err := producer.Close(); err != nil {
-		t.Error(err)
-	}
 }
 
-func TestProducingToInvalidTopic(t *testing.T) {
-	checkKafkaAvailability(t)
-
-	producer, err := NewSyncProducer(kafkaBrokers, nil)
-	if err != nil {
+func SaveProxy(t *testing.T, px string) {
+	if err := Proxies[px].Save(); err != nil {
 		t.Fatal(err)
 	}
-
-	if _, _, err := producer.SendMessage(&ProducerMessage{Topic: "in/valid"}); err != ErrUnknownTopicOrPartition {
-		t.Error("Expected ErrUnknownTopicOrPartition, found", err)
-	}
-
-	if _, _, err := producer.SendMessage(&ProducerMessage{Topic: "in/valid"}); err != ErrUnknownTopicOrPartition {
-		t.Error("Expected ErrUnknownTopicOrPartition, found", err)
-	}
-
-	safeClose(t, producer)
 }
 
-func testProducingMessages(t *testing.T, config *Config) {
+func setupFunctionalTest(t testing.TB) {
 	checkKafkaAvailability(t)
+	resetProxies(t)
+	fetchProxies(t)
+}
 
-	config.Producer.Return.Successes = true
-	config.Consumer.Return.Errors = true
-
-	client, err := NewClient(kafkaBrokers, config)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	master, err := NewConsumerFromClient(client)
-	if err != nil {
-		t.Fatal(err)
-	}
-	consumer, err := master.ConsumePartition("single_partition", 0, OffsetNewest)
-	if err != nil {
-		t.Fatal(err)
-	}
+func teardownFunctionalTest(t testing.TB) {
+	resetProxies(t)
+}
 
-	producer, err := NewAsyncProducerFromClient(client)
-	if err != nil {
-		t.Fatal(err)
-	}
+type kafkaVersion []int
 
-	expectedResponses := TestBatchSize
-	for i := 1; i <= TestBatchSize; {
-		msg := &ProducerMessage{Topic: "single_partition", Key: nil, Value: StringEncoder(fmt.Sprintf("testing %d", i))}
-		select {
-		case producer.Input() <- msg:
-			i++
-		case ret := <-producer.Errors():
-			t.Fatal(ret.Err)
-		case <-producer.Successes():
-			expectedResponses--
-		}
-	}
-	for expectedResponses > 0 {
-		select {
-		case ret := <-producer.Errors():
-			t.Fatal(ret.Err)
-		case <-producer.Successes():
-			expectedResponses--
+func (kv kafkaVersion) satisfies(other kafkaVersion) bool {
+	var ov int
+	for index, v := range kv {
+		if len(other) <= index {
+			ov = 0
+		} else {
+			ov = other[index]
 		}
-	}
-	safeClose(t, producer)
 
-	for i := 1; i <= TestBatchSize; i++ {
-		select {
-		case <-time.After(10 * time.Second):
-			t.Fatal("Not received any more events in the last 10 seconds.")
-
-		case err := <-consumer.Errors():
-			t.Error(err)
-
-		case message := <-consumer.Messages():
-			if string(message.Value) != fmt.Sprintf("testing %d", i) {
-				t.Fatalf("Unexpected message with index %d: %s", i, message.Value)
-			}
+		if v < ov {
+			return false
+		} else if v > ov {
+			return true
 		}
+	}
+	return true
+}
 
+func parseKafkaVersion(version string) kafkaVersion {
+	numbers := strings.Split(version, ".")
+	result := make(kafkaVersion, 0, len(numbers))
+	for _, number := range numbers {
+		nr, _ := strconv.Atoi(number)
+		result = append(result, nr)
 	}
-	safeClose(t, consumer)
-	safeClose(t, client)
+
+	return result
 }
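
As a minimal sketch of how these helpers behave (values are illustrative): parseKafkaVersion splits a dotted version string into integer components, and satisfies compares them component by component against the required version.

    available := parseKafkaVersion("0.8.2.1") // kafkaVersion{0, 8, 2, 1}
    required := parseKafkaVersion("0.8.2")    // kafkaVersion{0, 8, 2}

    available.satisfies(required)                    // true:  0.8.2.1 >= 0.8.2
    parseKafkaVersion("0.8.1.1").satisfies(required) // false: 0.8.1.1 <  0.8.2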

+ 1 - 4
message.go

@@ -66,10 +66,7 @@ func (m *Message) encode(pe packetEncoder) error {
 			m.compressedCache = buf.Bytes()
 			payload = m.compressedCache
 		case CompressionSnappy:
-			tmp, err := snappyEncode(m.Value)
-			if err != nil {
-				return err
-			}
+			tmp := snappyEncode(m.Value)
 			m.compressedCache = tmp
 			payload = m.compressedCache
 		default:

+ 20 - 0
metadata_request.go

@@ -19,6 +19,26 @@ func (mr *MetadataRequest) encode(pe packetEncoder) error {
 	return nil
 }
 
+func (mr *MetadataRequest) decode(pd packetDecoder) error {
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+
+	mr.Topics = make([]string, topicCount)
+	for i := range mr.Topics {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		mr.Topics[i] = topic
+	}
+	return nil
+}
+
 func (mr *MetadataRequest) key() int16 {
 	return 3
 }

+ 3 - 3
metadata_request_test.go

@@ -19,11 +19,11 @@ var (
 
 func TestMetadataRequest(t *testing.T) {
 	request := new(MetadataRequest)
-	testEncodable(t, "no topics", request, metadataRequestNoTopics)
+	testRequest(t, "no topics", request, metadataRequestNoTopics)
 
 	request.Topics = []string{"topic1"}
-	testEncodable(t, "one topic", request, metadataRequestOneTopic)
+	testRequest(t, "one topic", request, metadataRequestOneTopic)
 
 	request.Topics = []string{"foo", "bar", "baz"}
-	testEncodable(t, "three topics", request, metadataRequestThreeTopics)
+	testRequest(t, "three topics", request, metadataRequestThreeTopics)
 }

+ 1 - 0
mockbroker_test.go

@@ -146,6 +146,7 @@ func newMockBrokerAddr(t *testing.T, brokerID int32, addr string) *mockBroker {
 	if err != nil {
 		t.Fatal(err)
 	}
+	Logger.Printf("mockbroker/%d listening on %s\n", brokerID, broker.listener.Addr().String())
 	_, portStr, err := net.SplitHostPort(broker.listener.Addr().String())
 	if err != nil {
 		t.Fatal(err)

+ 51 - 3
mocks/consumer.go

@@ -2,6 +2,7 @@ package mocks
 
 import (
 	"sync"
+	"sync/atomic"
 
 	"github.com/Shopify/sarama"
 )
@@ -14,6 +15,7 @@ type Consumer struct {
 	t                  ErrorReporter
 	config             *sarama.Config
 	partitionConsumers map[string]map[int32]*PartitionConsumer
+	metadata           map[string][]int32
 }
 
 // NewConsumer returns a new mock Consumer instance. The t argument should
@@ -62,6 +64,39 @@ func (c *Consumer) ConsumePartition(topic string, partition int32, offset int64)
 	return pc, nil
 }
 
+// Topics returns a list of topics, as registered with SetTopicMetadata
+func (c *Consumer) Topics() ([]string, error) {
+	c.l.Lock()
+	defer c.l.Unlock()
+
+	if c.metadata == nil {
+		c.t.Errorf("Unexpected call to Topics. Initialize the mock's topic metadata with SetTopicMetadata.")
+		return nil, sarama.ErrOutOfBrokers
+	}
+
+	var result []string
+	for topic := range c.metadata {
+		result = append(result, topic)
+	}
+	return result, nil
+}
+
+// Partitions returns the list of partitions for the given topic, as registered with SetTopicMetadata
+func (c *Consumer) Partitions(topic string) ([]int32, error) {
+	c.l.Lock()
+	defer c.l.Unlock()
+
+	if c.metadata == nil {
+		c.t.Errorf("Unexpected call to Partitions. Initialize the mock's topic metadata with SetTopicMetadata.")
+		return nil, sarama.ErrOutOfBrokers
+	}
+	if c.metadata[topic] == nil {
+		return nil, sarama.ErrUnknownTopicOrPartition
+	}
+
+	return c.metadata[topic], nil
+}
+
 // Close implements the Close method from the sarama.Consumer interface. It will close
 // all registered PartitionConsumer instances.
 func (c *Consumer) Close() error {
@@ -81,6 +116,15 @@ func (c *Consumer) Close() error {
 // Expectation API
 ///////////////////////////////////////////////////
 
+// SetTopicMetadata sets the cluster's topic/partition metadata,
+// which will be returned by Topics() and Partitions().
+func (c *Consumer) SetTopicMetadata(metadata map[string][]int32) {
+	c.l.Lock()
+	defer c.l.Unlock()
+
+	c.metadata = metadata
+}
+
 // ExpectConsumePartition will register a topic/partition, so you can set expectations on it.
 // The registered PartitionConsumer will be returned, so you can set expectations
 // on it using method chaining. Once a topic/partition is registered, you are
@@ -132,13 +176,13 @@ type PartitionConsumer struct {
 	consumed                bool
 	errorsShouldBeDrained   bool
 	messagesShouldBeDrained bool
+	highWaterMarkOffset     int64
 }
 
 func (pc *PartitionConsumer) handleExpectations() {
 	pc.l.Lock()
 	defer pc.l.Unlock()
 
-	var offset int64
 	for ex := range pc.expectations {
 		if ex.Err != nil {
 			pc.errors <- &sarama.ConsumerError{
@@ -147,11 +191,11 @@ func (pc *PartitionConsumer) handleExpectations() {
 				Err:       ex.Err,
 			}
 		} else {
-			offset++
+			atomic.AddInt64(&pc.highWaterMarkOffset, 1)
 
 			ex.Msg.Topic = pc.topic
 			ex.Msg.Partition = pc.partition
-			ex.Msg.Offset = offset
+			ex.Msg.Offset = atomic.LoadInt64(&pc.highWaterMarkOffset)
 
 			pc.messages <- ex.Msg
 		}
@@ -231,6 +275,10 @@ func (pc *PartitionConsumer) Messages() <-chan *sarama.ConsumerMessage {
 	return pc.messages
 }
 
+func (pc *PartitionConsumer) HighWaterMarkOffset() int64 {
+	return atomic.LoadInt64(&pc.highWaterMarkOffset) + 1
+}
+
 ///////////////////////////////////////////////////
 // Expectation API
 ///////////////////////////////////////////////////
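
A minimal usage sketch of the new mock metadata API (the testing.T value and topic name are illustrative):

    consumer := mocks.NewConsumer(t, nil)
    consumer.SetTopicMetadata(map[string][]int32{
        "my_topic": []int32{0, 1, 2},
    })

    topics, _ := consumer.Topics()                   // ["my_topic"]
    partitions, _ := consumer.Partitions("my_topic") // [0, 1, 2]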

+ 58 - 1
mocks/consumer_test.go

@@ -1,6 +1,7 @@
 package mocks
 
 import (
+	"sort"
 	"testing"
 
 	"github.com/Shopify/sarama"
@@ -187,6 +188,62 @@ func TestConsumerMeetsErrorsDrainedExpectation(t *testing.T) {
 	}
 
 	if len(trm.errors) != 0 {
-		t.Errorf("Expected ano expectation failures to be set on the error reporter.")
+		t.Errorf("Expected no expectation failures to be set on the error reporter.")
+	}
+}
+
+func TestConsumerTopicMetadata(t *testing.T) {
+	trm := newTestReporterMock()
+	consumer := NewConsumer(trm, nil)
+
+	consumer.SetTopicMetadata(map[string][]int32{
+		"test1": []int32{0, 1, 2, 3},
+		"test2": []int32{0, 1, 2, 3, 4, 5, 6, 7},
+	})
+
+	topics, err := consumer.Topics()
+	if err != nil {
+		t.Error(err)
+	}
+
+	sortedTopics := sort.StringSlice(topics)
+	sortedTopics.Sort()
+	if len(sortedTopics) != 2 || sortedTopics[0] != "test1" || sortedTopics[1] != "test2" {
+		t.Error("Unexpected topics returned:", sortedTopics)
+	}
+
+	partitions1, err := consumer.Partitions("test1")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(partitions1) != 4 {
+		t.Error("Unexpected partitions returned:", len(partitions1))
+	}
+
+	partitions2, err := consumer.Partitions("test2")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(partitions2) != 8 {
+		t.Error("Unexpected partitions returned:", len(partitions2))
+	}
+
+	if len(trm.errors) != 0 {
+		t.Errorf("Expected no expectation failures to be set on the error reporter.")
+	}
+}
+
+func TestConsumerUnexpectedTopicMetadata(t *testing.T) {
+	trm := newTestReporterMock()
+	consumer := NewConsumer(trm, nil)
+
+	if _, err := consumer.Topics(); err != sarama.ErrOutOfBrokers {
+		t.Error("Expected sarama.ErrOutOfBrokers, found", err)
+	}
+
+	if len(trm.errors) != 1 {
+		t.Errorf("Expected an expectation failure to be set on the error reporter.")
 	}
 }

+ 115 - 14
offset_commit_request.go

@@ -2,6 +2,7 @@ package sarama
 
 // ReceiveTime is a special value for the timestamp field of Offset Commit Requests which
 // tells the broker to set the timestamp to the time at which the request was received.
+// The timestamp is only used if message version 1 is used, which requires kafka 0.8.2.
 const ReceiveTime int64 = -1
 
 type offsetCommitRequestBlock struct {
@@ -10,41 +11,141 @@ type offsetCommitRequestBlock struct {
 	metadata  string
 }
 
-func (r *offsetCommitRequestBlock) encode(pe packetEncoder) error {
+func (r *offsetCommitRequestBlock) encode(pe packetEncoder, version int16) error {
 	pe.putInt64(r.offset)
-	pe.putInt64(r.timestamp)
+	if version == 1 {
+		pe.putInt64(r.timestamp)
+	} else if r.timestamp != 0 {
+		Logger.Println("Non-zero timestamp specified for OffsetCommitRequest not v1, it will be ignored")
+	}
+
 	return pe.putString(r.metadata)
 }
 
+func (r *offsetCommitRequestBlock) decode(pd packetDecoder, version int16) (err error) {
+	if r.offset, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if version == 1 {
+		if r.timestamp, err = pd.getInt64(); err != nil {
+			return err
+		}
+	}
+	r.metadata, err = pd.getString()
+	return err
+}
+
 type OffsetCommitRequest struct {
-	ConsumerGroup string
-	blocks        map[string]map[int32]*offsetCommitRequestBlock
+	ConsumerGroup           string
+	ConsumerGroupGeneration int32  // v1 or later
+	ConsumerID              string // v1 or later
+	RetentionTime           int64  // v2 or later
+
+	// Version can be:
+	// - 0 (kafka 0.8.1 and later)
+	// - 1 (kafka 0.8.2 and later)
+	// - 2 (kafka 0.8.3 and later)
+	Version int16
+	blocks  map[string]map[int32]*offsetCommitRequestBlock
 }
 
 func (r *OffsetCommitRequest) encode(pe packetEncoder) error {
-	err := pe.putString(r.ConsumerGroup)
-	if err != nil {
+	if r.Version < 0 || r.Version > 2 {
+		return PacketEncodingError{"invalid or unsupported OffsetCommitRequest version field"}
+	}
+
+	if err := pe.putString(r.ConsumerGroup); err != nil {
 		return err
 	}
-	err = pe.putArrayLength(len(r.blocks))
-	if err != nil {
+
+	if r.Version >= 1 {
+		pe.putInt32(r.ConsumerGroupGeneration)
+		if err := pe.putString(r.ConsumerID); err != nil {
+			return err
+		}
+	} else {
+		if r.ConsumerGroupGeneration != 0 {
+			Logger.Println("Non-zero ConsumerGroupGeneration specified for OffsetCommitRequest v0, it will be ignored")
+		}
+		if r.ConsumerID != "" {
+			Logger.Println("Non-empty ConsumerID specified for OffsetCommitRequest v0, it will be ignored")
+		}
+	}
+
+	if r.Version >= 2 {
+		pe.putInt64(r.RetentionTime)
+	} else if r.RetentionTime != 0 {
+		Logger.Println("Non-zero RetentionTime specified for OffsetCommitRequest version <2, it will be ignored")
+	}
+
+	if err := pe.putArrayLength(len(r.blocks)); err != nil {
 		return err
 	}
 	for topic, partitions := range r.blocks {
-		err = pe.putString(topic)
-		if err != nil {
+		if err := pe.putString(topic); err != nil {
 			return err
 		}
-		err = pe.putArrayLength(len(partitions))
-		if err != nil {
+		if err := pe.putArrayLength(len(partitions)); err != nil {
 			return err
 		}
 		for partition, block := range partitions {
 			pe.putInt32(partition)
-			err = block.encode(pe)
+			if err := block.encode(pe, r.Version); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func (r *OffsetCommitRequest) decode(pd packetDecoder) (err error) {
+	if r.ConsumerGroup, err = pd.getString(); err != nil {
+		return err
+	}
+
+	if r.Version >= 1 {
+		if r.ConsumerGroupGeneration, err = pd.getInt32(); err != nil {
+			return err
+		}
+		if r.ConsumerID, err = pd.getString(); err != nil {
+			return err
+		}
+	}
+
+	if r.Version >= 2 {
+		if r.RetentionTime, err = pd.getInt64(); err != nil {
+			return err
+		}
+	}
+
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+	r.blocks = make(map[string]map[int32]*offsetCommitRequestBlock)
+	for i := 0; i < topicCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		r.blocks[topic] = make(map[int32]*offsetCommitRequestBlock)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
 			if err != nil {
 				return err
 			}
+			block := &offsetCommitRequestBlock{}
+			if err := block.decode(pd, r.Version); err != nil {
+				return err
+			}
+			r.blocks[topic][partition] = block
 		}
 	}
 	return nil
@@ -55,7 +156,7 @@ func (r *OffsetCommitRequest) key() int16 {
 }
 
 func (r *OffsetCommitRequest) version() int16 {
-	return 0
+	return r.Version
 }
 
 func (r *OffsetCommitRequest) AddBlock(topic string, partitionID int32, offset int64, timestamp int64, metadata string) {
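
A hedged sketch of how the versioned request is meant to be filled in (b is assumed to be an already-connected *Broker; values are illustrative):

    req := &OffsetCommitRequest{
        ConsumerGroup:           "my-group",
        ConsumerGroupGeneration: 1,            // only encoded for v1 and later
        ConsumerID:              "consumer-1", // only encoded for v1 and later
        Version:                 1,            // requires Kafka 0.8.2 or later
    }
    req.AddBlock("my_topic", 0, 42, ReceiveTime, "")
    response, err := b.CommitOffset(req) // check err and response.Errors as usual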

+ 64 - 8
offset_commit_request_test.go

@@ -3,32 +3,88 @@ package sarama
 import "testing"
 
 var (
-	offsetCommitRequestNoGroupNoBlocks = []byte{
-		0x00, 0x00,
+	offsetCommitRequestNoBlocksV0 = []byte{
+		0x00, 0x06, 'f', 'o', 'o', 'b', 'a', 'r',
+		0x00, 0x00, 0x00, 0x00}
+
+	offsetCommitRequestNoBlocksV1 = []byte{
+		0x00, 0x06, 'f', 'o', 'o', 'b', 'a', 'r',
+		0x00, 0x00, 0x11, 0x22,
+		0x00, 0x04, 'c', 'o', 'n', 's',
 		0x00, 0x00, 0x00, 0x00}
 
-	offsetCommitRequestNoBlocks = []byte{
+	offsetCommitRequestNoBlocksV2 = []byte{
 		0x00, 0x06, 'f', 'o', 'o', 'b', 'a', 'r',
+		0x00, 0x00, 0x11, 0x22,
+		0x00, 0x04, 'c', 'o', 'n', 's',
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x44, 0x33,
 		0x00, 0x00, 0x00, 0x00}
 
-	offsetCommitRequestOneBlock = []byte{
+	offsetCommitRequestOneBlockV0 = []byte{
 		0x00, 0x06, 'f', 'o', 'o', 'b', 'a', 'r',
 		0x00, 0x00, 0x00, 0x01,
 		0x00, 0x05, 't', 'o', 'p', 'i', 'c',
 		0x00, 0x00, 0x00, 0x01,
 		0x00, 0x00, 0x52, 0x21,
 		0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF,
+		0x00, 0x08, 'm', 'e', 't', 'a', 'd', 'a', 't', 'a'}
+
+	offsetCommitRequestOneBlockV1 = []byte{
+		0x00, 0x06, 'f', 'o', 'o', 'b', 'a', 'r',
+		0x00, 0x00, 0x11, 0x22,
+		0x00, 0x04, 'c', 'o', 'n', 's',
+		0x00, 0x00, 0x00, 0x01,
+		0x00, 0x05, 't', 'o', 'p', 'i', 'c',
+		0x00, 0x00, 0x00, 0x01,
+		0x00, 0x00, 0x52, 0x21,
+		0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF,
 		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 		0x00, 0x08, 'm', 'e', 't', 'a', 'd', 'a', 't', 'a'}
+
+	offsetCommitRequestOneBlockV2 = []byte{
+		0x00, 0x06, 'f', 'o', 'o', 'b', 'a', 'r',
+		0x00, 0x00, 0x11, 0x22,
+		0x00, 0x04, 'c', 'o', 'n', 's',
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x44, 0x33,
+		0x00, 0x00, 0x00, 0x01,
+		0x00, 0x05, 't', 'o', 'p', 'i', 'c',
+		0x00, 0x00, 0x00, 0x01,
+		0x00, 0x00, 0x52, 0x21,
+		0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF,
+		0x00, 0x08, 'm', 'e', 't', 'a', 'd', 'a', 't', 'a'}
 )
 
-func TestOffsetCommitRequest(t *testing.T) {
+func TestOffsetCommitRequestV0(t *testing.T) {
 	request := new(OffsetCommitRequest)
-	testEncodable(t, "no group, no blocks", request, offsetCommitRequestNoGroupNoBlocks)
+	request.Version = 0
+	request.ConsumerGroup = "foobar"
+	testRequest(t, "no blocks v0", request, offsetCommitRequestNoBlocksV0)
+
+	request.AddBlock("topic", 0x5221, 0xDEADBEEF, 0, "metadata")
+	testRequest(t, "one block v0", request, offsetCommitRequestOneBlockV0)
+}
 
+func TestOffsetCommitRequestV1(t *testing.T) {
+	request := new(OffsetCommitRequest)
 	request.ConsumerGroup = "foobar"
-	testEncodable(t, "no blocks", request, offsetCommitRequestNoBlocks)
+	request.ConsumerID = "cons"
+	request.ConsumerGroupGeneration = 0x1122
+	request.Version = 1
+	testRequest(t, "no blocks v1", request, offsetCommitRequestNoBlocksV1)
 
 	request.AddBlock("topic", 0x5221, 0xDEADBEEF, ReceiveTime, "metadata")
-	testEncodable(t, "one block", request, offsetCommitRequestOneBlock)
+	testRequest(t, "one block v1", request, offsetCommitRequestOneBlockV1)
+}
+
+func TestOffsetCommitRequestV2(t *testing.T) {
+	request := new(OffsetCommitRequest)
+	request.ConsumerGroup = "foobar"
+	request.ConsumerID = "cons"
+	request.ConsumerGroupGeneration = 0x1122
+	request.RetentionTime = 0x4433
+	request.Version = 2
+	testRequest(t, "no blocks v2", request, offsetCommitRequestNoBlocksV2)
+
+	request.AddBlock("topic", 0x5221, 0xDEADBEEF, 0, "metadata")
+	testRequest(t, "one block v2", request, offsetCommitRequestOneBlockV2)
 }

+ 32 - 1
offset_fetch_request.go

@@ -2,10 +2,15 @@ package sarama
 
 type OffsetFetchRequest struct {
 	ConsumerGroup string
+	Version       int16
 	partitions    map[string][]int32
 }
 
 func (r *OffsetFetchRequest) encode(pe packetEncoder) (err error) {
+	if r.Version < 0 || r.Version > 1 {
+		return PacketEncodingError{"invalid or unsupported OffsetFetchRequest version field"}
+	}
+
 	if err = pe.putString(r.ConsumerGroup); err != nil {
 		return err
 	}
@@ -23,12 +28,38 @@ func (r *OffsetFetchRequest) encode(pe packetEncoder) (err error) {
 	return nil
 }
 
+func (r *OffsetFetchRequest) decode(pd packetDecoder) (err error) {
+	if r.ConsumerGroup, err = pd.getString(); err != nil {
+		return err
+	}
+	partitionCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if partitionCount == 0 {
+		return nil
+	}
+	r.partitions = make(map[string][]int32)
+	for i := 0; i < partitionCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitions, err := pd.getInt32Array()
+		if err != nil {
+			return err
+		}
+		r.partitions[topic] = partitions
+	}
+	return nil
+}
+
 func (r *OffsetFetchRequest) key() int16 {
 	return 9
 }
 
 func (r *OffsetFetchRequest) version() int16 {
-	return 0
+	return r.Version
 }
 
 func (r *OffsetFetchRequest) AddPartition(topic string, partitionID int32) {

+ 3 - 3
offset_fetch_request_test.go

@@ -21,11 +21,11 @@ var (
 
 func TestOffsetFetchRequest(t *testing.T) {
 	request := new(OffsetFetchRequest)
-	testEncodable(t, "no group, no partitions", request, offsetFetchRequestNoGroupNoPartitions)
+	testRequest(t, "no group, no partitions", request, offsetFetchRequestNoGroupNoPartitions)
 
 	request.ConsumerGroup = "blah"
-	testEncodable(t, "no partitions", request, offsetFetchRequestNoPartitions)
+	testRequest(t, "no partitions", request, offsetFetchRequestNoPartitions)
 
 	request.AddPartition("topicTheFirst", 0x4F4F4F4F)
-	testEncodable(t, "one partition", request, offsetFetchRequestOnePartition)
+	testRequest(t, "one partition", request, offsetFetchRequestOnePartition)
 }

+ 48 - 1
offset_request.go

@@ -11,6 +11,16 @@ func (r *offsetRequestBlock) encode(pe packetEncoder) error {
 	return nil
 }
 
+func (r *offsetRequestBlock) decode(pd packetDecoder) (err error) {
+	if r.time, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if r.maxOffsets, err = pd.getInt32(); err != nil {
+		return err
+	}
+	return nil
+}
+
 type OffsetRequest struct {
 	blocks map[string]map[int32]*offsetRequestBlock
 }
@@ -32,10 +42,47 @@ func (r *OffsetRequest) encode(pe packetEncoder) error {
 		}
 		for partition, block := range partitions {
 			pe.putInt32(partition)
-			err = block.encode(pe)
+			if err = block.encode(pe); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func (r *OffsetRequest) decode(pd packetDecoder) error {
+	// Ignore replica ID
+	if _, err := pd.getInt32(); err != nil {
+		return err
+	}
+	blockCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if blockCount == 0 {
+		return nil
+	}
+	r.blocks = make(map[string]map[int32]*offsetRequestBlock)
+	for i := 0; i < blockCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		r.blocks[topic] = make(map[int32]*offsetRequestBlock)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
 			if err != nil {
 				return err
 			}
+			block := &offsetRequestBlock{}
+			if err := block.decode(pd); err != nil {
+				return err
+			}
+			r.blocks[topic][partition] = block
 		}
 	}
 	return nil

+ 2 - 2
offset_request_test.go

@@ -19,8 +19,8 @@ var (
 
 func TestOffsetRequest(t *testing.T) {
 	request := new(OffsetRequest)
-	testEncodable(t, "no blocks", request, offsetRequestNoBlocks)
+	testRequest(t, "no blocks", request, offsetRequestNoBlocks)
 
 	request.AddBlock("foo", 4, 1, 2)
-	testEncodable(t, "one block", request, offsetRequestOneBlock)
+	testRequest(t, "one block", request, offsetRequestOneBlock)
 }

+ 4 - 5
prep_encoder.go

@@ -1,7 +1,6 @@
 package sarama
 
 import (
-	"encoding/binary"
 	"fmt"
 	"math"
 )
@@ -13,19 +12,19 @@ type prepEncoder struct {
 // primitives
 
 func (pe *prepEncoder) putInt8(in int8) {
-	pe.length += binary.Size(in)
+	pe.length += 1
 }
 
 func (pe *prepEncoder) putInt16(in int16) {
-	pe.length += binary.Size(in)
+	pe.length += 2
 }
 
 func (pe *prepEncoder) putInt32(in int32) {
-	pe.length += binary.Size(in)
+	pe.length += 4
 }
 
 func (pe *prepEncoder) putInt64(in int64) {
-	pe.length += binary.Size(in)
+	pe.length += 8
 }
 
 func (pe *prepEncoder) putArrayLength(in int) error {

+ 50 - 0
produce_request.go

@@ -54,6 +54,56 @@ func (p *ProduceRequest) encode(pe packetEncoder) error {
 	return nil
 }
 
+func (p *ProduceRequest) decode(pd packetDecoder) error {
+	requiredAcks, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	p.RequiredAcks = RequiredAcks(requiredAcks)
+	if p.Timeout, err = pd.getInt32(); err != nil {
+		return err
+	}
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+	p.msgSets = make(map[string]map[int32]*MessageSet)
+	for i := 0; i < topicCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		p.msgSets[topic] = make(map[int32]*MessageSet)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			messageSetSize, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			if messageSetSize == 0 {
+				continue
+			}
+			msgSet := &MessageSet{}
+			err = msgSet.decode(pd)
+			if err != nil {
+				return err
+			}
+			p.msgSets[topic][partition] = msgSet
+		}
+	}
+	return nil
+}
+
 func (p *ProduceRequest) key() int16 {
 	return 0
 }

+ 3 - 3
produce_request_test.go

@@ -36,12 +36,12 @@ var (
 
 func TestProduceRequest(t *testing.T) {
 	request := new(ProduceRequest)
-	testEncodable(t, "empty", request, produceRequestEmpty)
+	testRequest(t, "empty", request, produceRequestEmpty)
 
 	request.RequiredAcks = 0x123
 	request.Timeout = 0x444
-	testEncodable(t, "header", request, produceRequestHeader)
+	testRequest(t, "header", request, produceRequestHeader)
 
 	request.AddMessage("topic", 0xAD, &Message{Codec: CompressionNone, Key: nil, Value: []byte{0x00, 0xEE}})
-	testEncodable(t, "one message", request, produceRequestOneMessage)
+	testRequest(t, "one message", request, produceRequestOneMessage)
 }

+ 6 - 6
real_decoder.go

@@ -19,7 +19,7 @@ func (rd *realDecoder) getInt8() (int8, error) {
 		return -1, ErrInsufficientData
 	}
 	tmp := int8(rd.raw[rd.off])
-	rd.off += binary.Size(tmp)
+	rd.off += 1
 	return tmp, nil
 }
 
@@ -29,7 +29,7 @@ func (rd *realDecoder) getInt16() (int16, error) {
 		return -1, ErrInsufficientData
 	}
 	tmp := int16(binary.BigEndian.Uint16(rd.raw[rd.off:]))
-	rd.off += binary.Size(tmp)
+	rd.off += 2
 	return tmp, nil
 }
 
@@ -39,7 +39,7 @@ func (rd *realDecoder) getInt32() (int32, error) {
 		return -1, ErrInsufficientData
 	}
 	tmp := int32(binary.BigEndian.Uint32(rd.raw[rd.off:]))
-	rd.off += binary.Size(tmp)
+	rd.off += 4
 	return tmp, nil
 }
 
@@ -49,7 +49,7 @@ func (rd *realDecoder) getInt64() (int64, error) {
 		return -1, ErrInsufficientData
 	}
 	tmp := int64(binary.BigEndian.Uint64(rd.raw[rd.off:]))
-	rd.off += binary.Size(tmp)
+	rd.off += 8
 	return tmp, nil
 }
 
@@ -147,7 +147,7 @@ func (rd *realDecoder) getInt32Array() ([]int32, error) {
 	ret := make([]int32, n)
 	for i := range ret {
 		ret[i] = int32(binary.BigEndian.Uint32(rd.raw[rd.off:]))
-		rd.off += binary.Size(ret[i])
+		rd.off += 4
 	}
 	return ret, nil
 }
@@ -176,7 +176,7 @@ func (rd *realDecoder) getInt64Array() ([]int64, error) {
 	ret := make([]int64, n)
 	for i := range ret {
 		ret[i] = int64(binary.BigEndian.Uint64(rd.raw[rd.off:]))
-		rd.off += binary.Size(ret[i])
+		rd.off += 8
 	}
 	return ret, nil
 }

+ 4 - 4
real_encoder.go

@@ -12,22 +12,22 @@ type realEncoder struct {
 
 func (re *realEncoder) putInt8(in int8) {
 	re.raw[re.off] = byte(in)
-	re.off += binary.Size(in)
+	re.off += 1
 }
 
 func (re *realEncoder) putInt16(in int16) {
 	binary.BigEndian.PutUint16(re.raw[re.off:], uint16(in))
-	re.off += binary.Size(in)
+	re.off += 2
 }
 
 func (re *realEncoder) putInt32(in int32) {
 	binary.BigEndian.PutUint32(re.raw[re.off:], uint32(in))
-	re.off += binary.Size(in)
+	re.off += 4
 }
 
 func (re *realEncoder) putInt64(in int64) {
 	binary.BigEndian.PutUint64(re.raw[re.off:], uint64(in))
-	re.off += binary.Size(in)
+	re.off += 8
 }
 
 func (re *realEncoder) putArrayLength(in int) error {

+ 75 - 4
request.go

@@ -1,15 +1,22 @@
 package sarama
 
-type requestEncoder interface {
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+)
+
+type requestBody interface {
 	encoder
+	decoder
 	key() int16
 	version() int16
 }
 
 type request struct {
 	correlationID int32
-	id            string
-	body          requestEncoder
+	clientID      string
+	body          requestBody
 }
 
 func (r *request) encode(pe packetEncoder) (err error) {
@@ -17,7 +24,7 @@ func (r *request) encode(pe packetEncoder) (err error) {
 	pe.putInt16(r.body.key())
 	pe.putInt16(r.body.version())
 	pe.putInt32(r.correlationID)
-	err = pe.putString(r.id)
+	err = pe.putString(r.clientID)
 	if err != nil {
 		return err
 	}
@@ -27,3 +34,67 @@ func (r *request) encode(pe packetEncoder) (err error) {
 	}
 	return pe.pop()
 }
+
+func (r *request) decode(pd packetDecoder) (err error) {
+	var key int16
+	if key, err = pd.getInt16(); err != nil {
+		return err
+	}
+	var version int16
+	if version, err = pd.getInt16(); err != nil {
+		return err
+	}
+	if r.correlationID, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if r.clientID, err = pd.getString(); err != nil {
+		return err
+	}
+
+	r.body = allocateBody(key, version)
+	if r.body == nil {
+		return PacketDecodingError{fmt.Sprintf("Unknown request key: %d", key)}
+	}
+	return r.body.decode(pd)
+}
+
+func decodeRequest(r io.Reader) (req *request, err error) {
+	lengthBytes := make([]byte, 4)
+	if _, err := io.ReadFull(r, lengthBytes); err != nil {
+		return nil, err
+	}
+
+	length := int32(binary.BigEndian.Uint32(lengthBytes))
+	if length <= 4 || length > MaxRequestSize {
+		return nil, PacketDecodingError{fmt.Sprintf("Message of length %d too large or too small", length)}
+	}
+
+	encodedReq := make([]byte, length)
+	if _, err := io.ReadFull(r, encodedReq); err != nil {
+		return nil, err
+	}
+
+	req = &request{}
+	if err := decode(encodedReq, req); err != nil {
+		return nil, err
+	}
+	return req, nil
+}
+
+func allocateBody(key, version int16) requestBody {
+	switch key {
+	case 0:
+		return &ProduceRequest{}
+	case 1:
+		return &FetchRequest{}
+	case 2:
+		return &OffsetRequest{}
+	case 3:
+		return &MetadataRequest{}
+	case 8:
+		return &OffsetCommitRequest{Version: version}
+	case 9:
+		return &OffsetFetchRequest{}
+	case 10:
+		return &ConsumerMetadataRequest{}
+	}
+	return nil
+}

+ 22 - 15
request_test.go

@@ -2,19 +2,10 @@ package sarama
 
 import (
 	"bytes"
+	"reflect"
 	"testing"
 )
 
-var (
-	requestSimple = []byte{
-		0x00, 0x00, 0x00, 0x17, // msglen
-		0x06, 0x66,
-		0x00, 0xD2,
-		0x00, 0x00, 0x12, 0x34,
-		0x00, 0x08, 'm', 'y', 'C', 'l', 'i', 'e', 'n', 't',
-		0x00, 0x03, 'a', 'b', 'c'}
-)
-
 type testRequestBody struct {
 }
 
@@ -30,11 +21,6 @@ func (s *testRequestBody) encode(pe packetEncoder) error {
 	return pe.putString("abc")
 }
 
-func TestRequest(t *testing.T) {
-	request := request{correlationID: 0x1234, id: "myClient", body: new(testRequestBody)}
-	testEncodable(t, "simple", &request, requestSimple)
-}
-
 // not specific to request tests, just helper functions for testing structures that
 // implement the encoder or decoder interfaces that needed somewhere to live
 
@@ -53,3 +39,24 @@ func testDecodable(t *testing.T, name string, out decoder, in []byte) {
 		t.Error("Decoding", name, "failed:", err)
 	}
 }
+
+func testRequest(t *testing.T, name string, rb requestBody, expected []byte) {
+	// Encode the request
+	req := &request{correlationID: 123, clientID: "foo", body: rb}
+	packet, err := encode(req)
+	headerSize := 14 + len("foo")
+	if err != nil {
+		t.Error(err)
+	} else if !bytes.Equal(packet[headerSize:], expected) {
+		t.Error("Encoding", name, "failed\ngot ", packet, "\nwant", expected)
+	}
+	// Decode the request
+	decoded, err := decodeRequest(bytes.NewReader(packet))
+	if err != nil {
+		t.Error("Failed to decode request", err)
+	} else if decoded.correlationID != 123 || decoded.clientID != "foo" {
+		t.Errorf("Decoded header is not valid: %v", decoded)
+	} else if !reflect.DeepEqual(rb, decoded.body) {
+		t.Errorf("Decoded request does not match the encoded one\n    encoded: %v\n    decoded: %v", rb, decoded.body)
+	}
+}

+ 4 - 3
sarama.go

@@ -40,7 +40,8 @@ var PanicHandler func(interface{})
 var MaxRequestSize int32 = 100 * 1024 * 1024
 
 // MaxResponseSize is the maximum size (in bytes) of any response that Sarama will attempt to parse. If
-// a broker returns a response message larger than this value, Sarama will return a PacketDecodingError. The
-// default of 100 MiB is aligned with Kafka's default `socket.request.max.bytes`, which is the largest
-// request the broker will attempt to process.
+// a broker returns a response message larger than this value, Sarama will return a PacketDecodingError to
+// protect the client from running out of memory. Please note that brokers do not have any natural limit on
+// the size of responses they send. In particular, they can send arbitrarily large fetch responses to consumers
+// (see https://issues.apache.org/jira/browse/KAFKA-2063).
 var MaxResponseSize int32 = 100 * 1024 * 1024
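
Both limits are plain package-level variables, so a client that knowingly exchanges very large messages can raise them before opening any connections; an illustrative sketch:

    // Raise the hard parse limits from the default 100 MiB to 512 MiB.
    sarama.MaxRequestSize = 512 * 1024 * 1024
    sarama.MaxResponseSize = 512 * 1024 * 1024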

+ 12 - 8
snappy.go

@@ -2,14 +2,15 @@ package sarama
 
 import (
 	"bytes"
-	"code.google.com/p/snappy-go/snappy"
 	"encoding/binary"
+
+	"github.com/golang/snappy"
 )
 
 var snappyMagic = []byte{130, 83, 78, 65, 80, 80, 89, 0}
 
 // SnappyEncode encodes binary data
-func snappyEncode(src []byte) ([]byte, error) {
+func snappyEncode(src []byte) []byte {
 	return snappy.Encode(nil, src)
 }
 
@@ -17,18 +18,21 @@ func snappyEncode(src []byte) ([]byte, error) {
 func snappyDecode(src []byte) ([]byte, error) {
 	if bytes.Equal(src[:8], snappyMagic) {
 		var (
-			pos = uint32(16)
-			max = uint32(len(src))
-			dst []byte
+			pos   = uint32(16)
+			max   = uint32(len(src))
+			dst   = make([]byte, 0, len(src))
+			chunk []byte
+			err   error
 		)
 		for pos < max {
 			size := binary.BigEndian.Uint32(src[pos : pos+4])
-			pos = pos + 4
-			chunk, err := snappy.Decode(nil, src[pos:pos+size])
+			pos += 4
+
+			chunk, err = snappy.Decode(chunk, src[pos:pos+size])
 			if err != nil {
 				return nil, err
 			}
-			pos = pos + size
+			pos += size
 			dst = append(dst, chunk...)
 		}
 		return dst, nil

+ 2 - 4
snappy_test.go

@@ -19,10 +19,8 @@ var snappyStreamTestCases = map[string][]byte{
 
 func TestSnappyEncode(t *testing.T) {
 	for src, exp := range snappyTestCases {
-		dst, err := snappyEncode([]byte(src))
-		if err != nil {
-			t.Error("Encoding error: ", err)
-		} else if !bytes.Equal(dst, exp) {
+		dst := snappyEncode([]byte(src))
+		if !bytes.Equal(dst, exp) {
 			t.Errorf("Expected %s to generate %v, but was %v", src, exp, dst)
 		}
 	}

+ 4 - 1
tools/README.md

@@ -4,4 +4,7 @@ This folder contains applications that are useful for exploration of your Kafka
 Some of these tools mirror tools that ship with Kafka, but these tools won't require installing the JVM to function.
 
 - [kafka-console-producer](./kafka-console-producer): a command line tool to produce a single message to your Kafka cluster.
-- [kafka-console-partitionconsumer](./kafka-console-producer): a command line tool to consume a single partition of a topic on your Kafka cluster.
+- [kafka-console-partitionconsumer](./kafka-console-partitionconsumer): (deprecated) a command line tool to consume a single partition of a topic on your Kafka cluster.
+- [kafka-console-consumer](./kafka-console-consumer): a command line tool to consume arbitrary partitions of a topic on your Kafka cluster.
+
+To install all tools, run `go get github.com/Shopify/sarama/tools/...`

+ 2 - 0
tools/kafka-console-consumer/.gitignore

@@ -0,0 +1,2 @@
+kafka-console-consumer
+kafka-console-consumer.test

+ 29 - 0
tools/kafka-console-consumer/README.md

@@ -0,0 +1,29 @@
+# kafka-console-consumer
+
+A simple command line tool to consume partitions of a topic and print the
+messages on the standard output.
+
+### Installation
+
+    go get github.com/Shopify/sarama/tools/kafka-console-consumer
+
+### Usage
+
+    # Minimum invocation
+    kafka-console-consumer -topic=test -brokers=kafka1:9092
+
+    # It will pick up a KAFKA_PEERS environment variable
+    export KAFKA_PEERS=kafka1:9092,kafka2:9092,kafka3:9092
+    kafka-console-consumer -topic=test
+
+    # You can specify the offset you want to start at. It can be either
+    # `oldest` or `newest`. The default is `newest`.
+    kafka-console-consumer -topic=test -offset=oldest
+    kafka-console-consumer -topic=test -offset=newest
+
+    # You can specify the partition(s) you want to consume as a comma-separated
+    # list. The default is `all`.
+    kafka-console-consumer -topic=test -partitions=1,2,3
+
+    # Display all command line options
+    kafka-console-consumer -help

+ 145 - 0
tools/kafka-console-consumer/kafka-console-consumer.go

@@ -0,0 +1,145 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"os/signal"
+	"strconv"
+	"strings"
+	"sync"
+
+	"github.com/Shopify/sarama"
+)
+
+var (
+	brokerList = flag.String("brokers", os.Getenv("KAFKA_PEERS"), "The comma separated list of brokers in the Kafka cluster")
+	topic      = flag.String("topic", "", "REQUIRED: the topic to consume")
+	partitions = flag.String("partitions", "all", "The partitions to consume, can be 'all' or comma-separated numbers")
+	offset     = flag.String("offset", "newest", "The offset to start with. Can be `oldest` or `newest`")
+	verbose    = flag.Bool("verbose", false, "Whether to turn on sarama logging")
+	bufferSize = flag.Int("buffer-size", 256, "The buffer size of the message channel.")
+
+	logger = log.New(os.Stderr, "", log.LstdFlags)
+)
+
+func main() {
+	flag.Parse()
+
+	if *brokerList == "" {
+		printUsageErrorAndExit("You have to provide -brokers as a comma-separated list, or set the KAFKA_PEERS environment variable.")
+	}
+
+	if *topic == "" {
+		printUsageErrorAndExit("-topic is required")
+	}
+
+	if *verbose {
+		sarama.Logger = logger
+	}
+
+	var initialOffset int64
+	switch *offset {
+	case "oldest":
+		initialOffset = sarama.OffsetOldest
+	case "newest":
+		initialOffset = sarama.OffsetNewest
+	default:
+		printUsageErrorAndExit("-offset should be `oldest` or `newest`")
+	}
+
+	c, err := sarama.NewConsumer(strings.Split(*brokerList, ","), nil)
+	if err != nil {
+		printErrorAndExit(69, "Failed to start consumer: %s", err)
+	}
+
+	partitionList, err := getPartitions(c)
+	if err != nil {
+		printErrorAndExit(69, "Failed to get the list of partitions: %s", err)
+	}
+
+	var (
+		messages = make(chan *sarama.ConsumerMessage, *bufferSize)
+		closing  = make(chan struct{})
+		wg       sync.WaitGroup
+	)
+
+	go func() {
+		signals := make(chan os.Signal, 1)
+		signal.Notify(signals, os.Kill, os.Interrupt)
+		<-signals
+		logger.Println("Initiating shutdown of consumer...")
+		close(closing)
+	}()
+
+	for _, partition := range partitionList {
+		pc, err := c.ConsumePartition(*topic, partition, initialOffset)
+		if err != nil {
+			printErrorAndExit(69, "Failed to start consumer for partition %d: %s", partition, err)
+		}
+
+		go func(pc sarama.PartitionConsumer) {
+			<-closing
+			pc.AsyncClose()
+		}(pc)
+
+		wg.Add(1)
+		go func(pc sarama.PartitionConsumer) {
+			defer wg.Done()
+			for message := range pc.Messages() {
+				messages <- message
+			}
+		}(pc)
+	}
+
+	go func() {
+		for msg := range messages {
+			fmt.Printf("Partition:\t%d\n", msg.Partition)
+			fmt.Printf("Offset:\t%d\n", msg.Offset)
+			fmt.Printf("Key:\t%s\n", string(msg.Key))
+			fmt.Printf("Value:\t%s\n", string(msg.Value))
+			fmt.Println()
+		}
+	}()
+
+	wg.Wait()
+	logger.Println("Done consuming topic", *topic)
+	close(messages)
+
+	if err := c.Close(); err != nil {
+		logger.Println("Failed to close consumer: ", err)
+	}
+}
+
+func getPartitions(c sarama.Consumer) ([]int32, error) {
+	if *partitions == "all" {
+		return c.Partitions(*topic)
+	}
+
+	tmp := strings.Split(*partitions, ",")
+	var pList []int32
+	for i := range tmp {
+		val, err := strconv.ParseInt(tmp[i], 10, 32)
+		if err != nil {
+			return nil, err
+		}
+		pList = append(pList, int32(val))
+	}
+
+	return pList, nil
+}
+
+func printErrorAndExit(code int, format string, values ...interface{}) {
+	fmt.Fprintf(os.Stderr, "ERROR: %s\n", fmt.Sprintf(format, values...))
+	fmt.Fprintln(os.Stderr)
+	os.Exit(code)
+}
+
+func printUsageErrorAndExit(format string, values ...interface{}) {
+	fmt.Fprintf(os.Stderr, "ERROR: %s\n", fmt.Sprintf(format, values...))
+	fmt.Fprintln(os.Stderr)
+	fmt.Fprintln(os.Stderr, "Available command line options:")
+	flag.PrintDefaults()
+	os.Exit(64)
+}

+ 4 - 1
tools/kafka-console-partitionconsumer/README.md

@@ -1,11 +1,14 @@
 # kafka-console-partitionconsumer
 
+NOTE: this tool is deprecated in favour of the more general and more powerful
+`kafka-console-consumer`.
+
 A simple command line tool to consume a partition of a topic and print the messages
 on the standard output.
 
 ### Installation
 
-    go install github.com/Shopify/sarama/tools/kafka-console-partitionconsumer
+    go get github.com/Shopify/sarama/tools/kafka-console-partitionconsumer
 
 ### Usage
 

+ 1 - 1
tools/kafka-console-producer/README.md

@@ -4,7 +4,7 @@ A simple command line tool to produce a single message to Kafka.
 
 ### Installation
 
-    go install github.com/Shopify/sarama/tools/kafka-console-producer
+    go get github.com/Shopify/sarama/tools/kafka-console-producer
 
 
 ### Usage

+ 4 - 2
utils.go

@@ -45,8 +45,10 @@ func withRecover(fn func()) {
 func safeAsyncClose(b *Broker) {
 	tmp := b // local var prevents clobbering in goroutine
 	go withRecover(func() {
-		if err := tmp.Close(); err != nil {
-			Logger.Println("Error closing broker", tmp.ID(), ":", err)
+		if connected, _ := tmp.Connected(); connected {
+			if err := tmp.Close(); err != nil {
+				Logger.Println("Error closing broker", tmp.ID(), ":", err)
+			}
 		}
 	})
 }

+ 12 - 3
vagrant/boot_cluster.sh

@@ -1,13 +1,22 @@
-#/bin/sh
+#!/bin/sh
 
 set -ex
 
+# Launch and wait for toxiproxy
+vagrant/run_toxiproxy.sh &
+while ! nc -q 1 localhost 2181 </dev/null; do echo "Waiting"; sleep 1; done
+while ! nc -q 1 localhost 9092 </dev/null; do echo "Waiting"; sleep 1; done
+
+# Launch and wait for Zookeeper
 for i in 1 2 3 4 5; do
-    KAFKA_PORT=`expr $i + 6666`
+    KAFKA_PORT=`expr $i + 9090`
     cd ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT} && bin/zookeeper-server-start.sh -daemon config/zookeeper.properties
 done
+while ! nc -q 1 localhost 21805 </dev/null; do echo "Waiting"; sleep 1; done
 
+# Launch and wait for Kafka
 for i in 1 2 3 4 5; do
-    KAFKA_PORT=`expr $i + 6666`
+    KAFKA_PORT=`expr $i + 9090`
     cd ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT} && bin/kafka-server-start.sh -daemon config/server.properties
 done
+while ! nc -q 1 localhost 29095 </dev/null; do echo "Waiting"; sleep 1; done
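
Once the script returns, each broker should be reachable both on its real port and through its toxiproxy listener. A quick manual check reusing the same `nc` idiom as above (the 9091-9095 layout is the one configured elsewhere in this change):

    for PORT in 9091 9092 9093 9094 9095; do
        nc -q 1 localhost ${PORT} </dev/null && echo "broker behind proxy port ${PORT} is up"
    done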

+ 4 - 7
vagrant/create_topics.sh

@@ -2,10 +2,7 @@
 
 set -ex
 
-sleep 10
-
-cd ${KAFKA_INSTALL_ROOT}/kafka-6667
-bin/kafka-topics.sh --create --partitions 1 --replication-factor 3 --topic single_partition --zookeeper localhost:2181
-bin/kafka-topics.sh --create --partitions 2 --replication-factor 3 --topic multi_partition  --zookeeper localhost:2181
-
-sleep 5
+cd ${KAFKA_INSTALL_ROOT}/kafka-9092
+bin/kafka-topics.sh --create --partitions 1 --replication-factor 3 --topic test.1 --zookeeper localhost:2181
+bin/kafka-topics.sh --create --partitions 4 --replication-factor 3 --topic test.4 --zookeeper localhost:2181
+bin/kafka-topics.sh --create --partitions 64 --replication-factor 3 --topic test.64  --zookeeper localhost:2181
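
To confirm the three test topics exist with the expected partition counts, the same CLI can be queried read-only; this sketch assumes the stock Kafka 0.8.x `kafka-topics.sh` flags:

    cd ${KAFKA_INSTALL_ROOT}/kafka-9092
    bin/kafka-topics.sh --list --zookeeper localhost:2181
    bin/kafka-topics.sh --describe --topic test.64 --zookeeper localhost:2181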

+ 15 - 3
vagrant/install_cluster.sh

@@ -1,15 +1,25 @@
-#/bin/sh
+#!/bin/sh
 
 set -ex
 
+TOXIPROXY_VERSION=1.0.3
+
 mkdir -p ${KAFKA_INSTALL_ROOT}
 if [ ! -f ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_VERSION}.tgz ]; then
     wget --quiet http://apache.mirror.gtcomm.net/kafka/${KAFKA_VERSION}/kafka_2.10-${KAFKA_VERSION}.tgz -O ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_VERSION}.tgz
 fi
+if [ ! -f ${KAFKA_INSTALL_ROOT}/toxiproxy-${TOXIPROXY_VERSION} ]; then
+    wget --quiet https://github.com/Shopify/toxiproxy/releases/download/v${TOXIPROXY_VERSION}/toxiproxy-linux-amd64 -O ${KAFKA_INSTALL_ROOT}/toxiproxy-${TOXIPROXY_VERSION}
+    chmod +x ${KAFKA_INSTALL_ROOT}/toxiproxy-${TOXIPROXY_VERSION}
+fi
+rm -f ${KAFKA_INSTALL_ROOT}/toxiproxy
+ln -s ${KAFKA_INSTALL_ROOT}/toxiproxy-${TOXIPROXY_VERSION} ${KAFKA_INSTALL_ROOT}/toxiproxy
 
 for i in 1 2 3 4 5; do
     ZK_PORT=`expr $i + 2180`
-    KAFKA_PORT=`expr $i + 6666`
+    ZK_PORT_REAL=`expr $i + 21800`
+    KAFKA_PORT=`expr $i + 9090`
+    KAFKA_PORT_REAL=`expr $i + 29090`
 
     # unpack kafka
     mkdir -p ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}
@@ -18,7 +28,9 @@ for i in 1 2 3 4 5; do
     # broker configuration
     cp ${REPOSITORY_ROOT}/vagrant/server.properties ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/
     sed -i s/KAFKAID/${KAFKA_PORT}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/server.properties
+    sed -i s/KAFKAPORT/${KAFKA_PORT_REAL}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/server.properties
     sed -i s/KAFKA_HOSTNAME/${KAFKA_HOSTNAME}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/server.properties
+    sed -i s/ZK_PORT/${ZK_PORT}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/server.properties
 
     KAFKA_DATADIR="${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/data"
     mkdir -p ${KAFKA_DATADIR}
@@ -27,7 +39,7 @@ for i in 1 2 3 4 5; do
     # zookeeper configuration
     cp ${REPOSITORY_ROOT}/vagrant/zookeeper.properties ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/
     sed -i s/KAFKAID/${KAFKA_PORT}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/zookeeper.properties
-    sed -i s/ZK_PORT/${ZK_PORT}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/zookeeper.properties
+    sed -i s/ZK_PORT/${ZK_PORT_REAL}/g ${KAFKA_INSTALL_ROOT}/kafka-${KAFKA_PORT}/config/zookeeper.properties
 
     ZK_DATADIR="${KAFKA_INSTALL_ROOT}/zookeeper-${ZK_PORT}"
     mkdir -p ${ZK_DATADIR}
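
The `expr` arithmetic above fixes the port layout for the five nodes: clients are pointed at 2181-2185 and 9091-9095 (the toxiproxy listeners), while ZooKeeper and Kafka actually bind 21801-21805 and 29091-29095. A throwaway loop using the script's own arithmetic prints the full mapping:

    for i in 1 2 3 4 5; do
        echo "node $i: zookeeper `expr $i + 2180` (proxy) -> `expr $i + 21800` (real)," \
             "kafka `expr $i + 9090` (proxy) -> `expr $i + 29090` (real)"
    done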

+ 22 - 0
vagrant/run_toxiproxy.sh

@@ -0,0 +1,22 @@
+#!/bin/sh
+
+set -ex
+
+${KAFKA_INSTALL_ROOT}/toxiproxy -port 8474 -host 0.0.0.0 &
+PID=$!
+
+while ! nc -q 1 localhost 8474 </dev/null; do echo "Waiting"; sleep 1; done
+
+wget -O/dev/null -S --post-data='{"name":"zk1", "upstream":"localhost:21801", "listen":"0.0.0.0:2181"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"zk2", "upstream":"localhost:21802", "listen":"0.0.0.0:2182"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"zk3", "upstream":"localhost:21803", "listen":"0.0.0.0:2183"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"zk4", "upstream":"localhost:21804", "listen":"0.0.0.0:2184"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"zk5", "upstream":"localhost:21805", "listen":"0.0.0.0:2185"}' localhost:8474/proxies
+
+wget -O/dev/null -S --post-data='{"name":"kafka1", "upstream":"localhost:29091", "listen":"0.0.0.0:9091"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"kafka2", "upstream":"localhost:29092", "listen":"0.0.0.0:9092"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"kafka3", "upstream":"localhost:29093", "listen":"0.0.0.0:9093"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"kafka4", "upstream":"localhost:29094", "listen":"0.0.0.0:9094"}' localhost:8474/proxies
+wget -O/dev/null -S --post-data='{"name":"kafka5", "upstream":"localhost:29095", "listen":"0.0.0.0:9095"}' localhost:8474/proxies
+
+wait $PID
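
Each proxy created above maps an advertised port (2181-2185, 9091-9095) onto the port the real process binds (21801-21805, 29091-29095). The resulting configuration can be inspected over the same HTTP API, assuming it also serves the proxy list on a GET of the endpoint the script posts to:

    wget -q -O- localhost:8474/proxies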

+ 8 - 6
vagrant/server.properties

@@ -22,15 +22,16 @@ broker.id=KAFKAID
 ############################# Socket Server Settings #############################
 
 # The port the socket server listens on
-port=KAFKAID
+port=KAFKAPORT
 
 # Hostname the broker will bind to. If not set, the server will bind to all interfaces
-#host.name=localhost
+host.name=localhost
 
 # Hostname the broker will advertise to producers and consumers. If not set, it uses the
 # value for "host.name" if configured.  Otherwise, it will use the value returned from
 # java.net.InetAddress.getCanonicalHostName().
 advertised.host.name=KAFKA_HOSTNAME
+advertised.port=KAFKAID
 
 # The port to publish to ZooKeeper for clients to use. If this is not set,
 # it will publish the same port that the broker binds to.
@@ -98,10 +99,10 @@ log.retention.hours=168
 
 # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
 # segments don't drop below log.retention.bytes.
-log.retention.bytes=1073741824
+log.retention.bytes=268435456
 
 # The maximum size of a log segment file. When this size is reached a new log segment will be created.
-log.segment.bytes=536870912
+log.segment.bytes=268435456
 
 # The interval at which log segments are checked to see if they can be deleted according
 # to the retention policies
@@ -118,7 +119,8 @@ log.cleaner.enable=false
 # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
 # You can also append an optional chroot string to the urls to specify the
 # root directory for all kafka znodes.
-zookeeper.connect=localhost:2181,localhost:2182,localhost:2183,localhost:2184,localhost:2185
+zookeeper.connect=localhost:ZK_PORT
 
 # Timeout in ms for connecting to zookeeper
-zookeeper.connection.timeout.ms=1000000
+zookeeper.session.timeout.ms=3000
+zookeeper.connection.timeout.ms=3000
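
After install_cluster.sh has applied its `sed` substitutions, the placeholders resolve per broker; for node 2, for example, broker.id=9092, port=29092 (the real bind), advertised.port=9092 (the toxiproxy listener) and zookeeper.connect=localhost:2182. The effective values can be read back from the generated file (the path assumes the KAFKA_INSTALL_ROOT layout used by the install script):

    grep -E '^(broker\.id|port|advertised\.port|zookeeper\.connect)=' \
        ${KAFKA_INSTALL_ROOT}/kafka-9092/config/server.properties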

+ 10 - 2
vagrant/setup_services.sh

@@ -1,10 +1,15 @@
-#/bin/sh
+#!/bin/sh
 
 set -ex
 
+stop toxiproxy || true
+cp ${REPOSITORY_ROOT}/vagrant/toxiproxy.conf /etc/init/toxiproxy.conf
+cp ${REPOSITORY_ROOT}/vagrant/run_toxiproxy.sh ${KAFKA_INSTALL_ROOT}/
+start toxiproxy
+
 for i in 1 2 3 4 5; do
     ZK_PORT=`expr $i + 2180`
-    KAFKA_PORT=`expr $i + 6666`
+    KAFKA_PORT=`expr $i + 9090`
 
     stop zookeeper-${ZK_PORT} || true
 
@@ -19,3 +24,6 @@ for i in 1 2 3 4 5; do
 
     start zookeeper-${ZK_PORT}
 done
+
+# Wait for the last kafka node to finish booting
+while ! nc -q 1 localhost 29095 </dev/null; do echo "Waiting"; sleep 1; done

+ 6 - 0
vagrant/toxiproxy.conf

@@ -0,0 +1,6 @@
+start on started networking
+stop on shutdown
+
+env KAFKA_INSTALL_ROOT=/opt
+
+exec /opt/run_toxiproxy.sh

+ 2 - 2
vagrant/zookeeper.conf

@@ -1,4 +1,4 @@
-start on filesystem or runlevel [2345]
-stop on runlevel [!2345]
+start on started toxiproxy
+stop on stopping toxiproxy
 
 exec /opt/kafka-KAFKAID/bin/zookeeper-server-start.sh /opt/kafka-KAFKAID/config/zookeeper.properties
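
With the upstart configuration above, toxiproxy becomes the root of the service chain: the ZooKeeper jobs start once toxiproxy has started and stop when it stops. Assuming upstart's standard job-control commands and the zookeeper-<port> job names from setup_services.sh, the chain can be exercised by hand:

    status toxiproxy        # expected: start/running after provisioning
    stop toxiproxy          # also takes down the dependent zookeeper-218x jobs
    status zookeeper-2181   # expected: stop/waiting
    start toxiproxy         # brings the chain back up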