Merge pull request #2831 from xiang90/index

storage: initial index and key index
Xiang Li, 10 years ago
commit 9be6a7c8fd
59 changed files with 13260 additions and 6 deletions
  1. +14 -5  Godeps/Godeps.json
  2. +3 -0  Godeps/_workspace/src/github.com/boltdb/bolt/.gitignore
  3. +20 -0  Godeps/_workspace/src/github.com/boltdb/bolt/LICENSE
  4. +54 -0  Godeps/_workspace/src/github.com/boltdb/bolt/Makefile
  5. +591 -0  Godeps/_workspace/src/github.com/boltdb/bolt/README.md
  6. +135 -0  Godeps/_workspace/src/github.com/boltdb/bolt/batch.go
  7. +170 -0  Godeps/_workspace/src/github.com/boltdb/bolt/batch_benchmark_test.go
  8. +148 -0  Godeps/_workspace/src/github.com/boltdb/bolt/batch_example_test.go
  9. +167 -0  Godeps/_workspace/src/github.com/boltdb/bolt/batch_test.go
  10. +7 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_386.go
  11. +7 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_amd64.go
  12. +7 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm.go
  13. +12 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go
  14. +29 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go
  15. +36 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_test.go
  16. +80 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_unix.go
  17. +74 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go
  18. +10 -0  Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go
  19. +743 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go
  20. +1153 -0  Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go
  21. BIN  Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/bolt
  22. +1529 -0  Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main.go
  23. +145 -0  Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main_test.go
  24. BIN  Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/p.out
  25. +384 -0  Godeps/_workspace/src/github.com/boltdb/bolt/cursor.go
  26. +511 -0  Godeps/_workspace/src/github.com/boltdb/bolt/cursor_test.go
  27. +732 -0  Godeps/_workspace/src/github.com/boltdb/bolt/db.go
  28. +790 -0  Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go
  29. +44 -0  Godeps/_workspace/src/github.com/boltdb/bolt/doc.go
  30. +66 -0  Godeps/_workspace/src/github.com/boltdb/bolt/errors.go
  31. +241 -0  Godeps/_workspace/src/github.com/boltdb/bolt/freelist.go
  32. +129 -0  Godeps/_workspace/src/github.com/boltdb/bolt/freelist_test.go
  33. +627 -0  Godeps/_workspace/src/github.com/boltdb/bolt/node.go
  34. +156 -0  Godeps/_workspace/src/github.com/boltdb/bolt/node_test.go
  35. +134 -0  Godeps/_workspace/src/github.com/boltdb/bolt/page.go
  36. +29 -0  Godeps/_workspace/src/github.com/boltdb/bolt/page_test.go
  37. +79 -0  Godeps/_workspace/src/github.com/boltdb/bolt/quick_test.go
  38. +327 -0  Godeps/_workspace/src/github.com/boltdb/bolt/simulation_test.go
  39. +585 -0  Godeps/_workspace/src/github.com/boltdb/bolt/tx.go
  40. +424 -0  Godeps/_workspace/src/github.com/boltdb/bolt/tx_test.go
  41. +1 -0  Godeps/_workspace/src/github.com/google/btree/.travis.yml
  42. +202 -0  Godeps/_workspace/src/github.com/google/btree/LICENSE
  43. +12 -0  Godeps/_workspace/src/github.com/google/btree/README.md
  44. +571 -0  Godeps/_workspace/src/github.com/google/btree/btree.go
  45. +76 -0  Godeps/_workspace/src/github.com/google/btree/btree_mem.go
  46. +293 -0  Godeps/_workspace/src/github.com/google/btree/btree_test.go
  47. +1 -1  scripts/genproto.sh
  48. +83 -0  storage/backend/backend.go
  49. +36 -0  storage/backend/backend_bench_test.go
  50. +29 -0  storage/backend/backend_test.go
  51. +112 -0  storage/backend/batch_tx.go
  52. +99 -0  storage/index.go
  53. +137 -0  storage/index_test.go
  54. +205 -0  storage/key_index.go
  55. +364 -0  storage/key_index_test.go
  56. +132 -0  storage/kv.go
  57. +24 -0  storage/kv_test.go
  58. +456 -0  storage/storagepb/kv.pb.go
  59. +35 -0  storage/storagepb/kv.proto

+ 14 - 5
Godeps/Godeps.json

@@ -1,6 +1,6 @@
 {
 	"ImportPath": "github.com/coreos/etcd",
-	"GoVersion": "go1.4.2",
+	"GoVersion": "go1.4.1",
 	"Packages": [
 		"./..."
 	],
@@ -10,6 +10,11 @@
 			"Comment": "go.r60-163",
 			"Rev": "9352842ae63ee1d7e74e074ce7bb10370c4b6b9e"
 		},
+		{
+			"ImportPath": "github.com/boltdb/bolt",
+			"Comment": "v1.0-71-g71f28ea",
+			"Rev": "71f28eaecbebd00604d87bb1de0dae8fcfa54bbd"
+		},
 		{
 			"ImportPath": "github.com/codegangsta/cli",
 			"Comment": "1.2.0-26-gf7ebb76",
@@ -20,14 +25,14 @@
 			"Comment": "v0.2.0-rc1-130-g6aa2da5",
 			"Rev": "6aa2da5a7a905609c93036b9307185a04a5a84a5"
 		},
-		{
-			"ImportPath": "github.com/coreos/pkg/capnslog",
-			"Rev": "9d5dd4632f9ece71bdf83d31253593a633e73df5"
-		},
 		{
 			"ImportPath": "github.com/coreos/go-semver/semver",
 			"Rev": "568e959cd89871e61434c1143528d9162da89ef2"
 		},
+		{
+			"ImportPath": "github.com/coreos/pkg/capnslog",
+			"Rev": "9d5dd4632f9ece71bdf83d31253593a633e73df5"
+		},
 		{
 			"ImportPath": "github.com/gogo/protobuf/proto",
 			"Rev": "bc946d07d1016848dfd2507f90f0859c9471681e"
@@ -36,6 +41,10 @@
 			"ImportPath": "github.com/golang/protobuf/proto",
 			"Rev": "5677a0e3d5e89854c9974e1256839ee23f8233ca"
 		},
+		{
+			"ImportPath": "github.com/google/btree",
+			"Rev": "cc6329d4279e3f025a53a83c397d2339b5705c45"
+		},
 		{
 			"ImportPath": "github.com/jonboulle/clockwork",
 			"Rev": "72f9bd7c4e0c2a40055ab3d0f09654f730cce982"

+ 3 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/.gitignore

@@ -0,0 +1,3 @@
+*.prof
+*.test
+/bin/

+ 20 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/LICENSE

@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Ben Johnson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 54 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/Makefile

@@ -0,0 +1,54 @@
+TEST=.
+BENCH=.
+COVERPROFILE=/tmp/c.out
+BRANCH=`git rev-parse --abbrev-ref HEAD`
+COMMIT=`git rev-parse --short HEAD`
+GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)"
+
+default: build
+
+bench:
+	go test -v -test.run=NOTHINCONTAINSTHIS -test.bench=$(BENCH)
+
+# http://cloc.sourceforge.net/
+cloc:
+	@cloc --not-match-f='Makefile|_test.go' .
+
+cover: fmt
+	go test -coverprofile=$(COVERPROFILE) -test.run=$(TEST) $(COVERFLAG) .
+	go tool cover -html=$(COVERPROFILE)
+	rm $(COVERPROFILE)
+
+cpuprofile: fmt
+	@go test -c
+	@./bolt.test -test.v -test.run=$(TEST) -test.cpuprofile cpu.prof
+
+# go get github.com/kisielk/errcheck
+errcheck:
+	@echo "=== errcheck ==="
+	@errcheck github.com/boltdb/bolt
+
+fmt:
+	@go fmt ./...
+
+get:
+	@go get -d ./...
+
+build: get
+	@mkdir -p bin
+	@go build -ldflags=$(GOLDFLAGS) -a -o bin/bolt ./cmd/bolt
+
+test: fmt
+	@go get github.com/stretchr/testify/assert
+	@echo "=== TESTS ==="
+	@go test -v -cover -test.run=$(TEST)
+	@echo ""
+	@echo ""
+	@echo "=== CLI ==="
+	@go test -v -test.run=$(TEST) ./cmd/bolt
+	@echo ""
+	@echo ""
+	@echo "=== RACE DETECTOR ==="
+	@go test -v -race -test.run="TestSimulate_(100op|1000op)"
+
+.PHONY: bench cloc cover cpuprofile fmt memprofile test

+ 591 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/README.md

@@ -0,0 +1,591 @@
+Bolt [![Build Status](https://drone.io/github.com/boltdb/bolt/status.png)](https://drone.io/github.com/boltdb/bolt/latest) [![Coverage Status](https://coveralls.io/repos/boltdb/bolt/badge.png?branch=master)](https://coveralls.io/r/boltdb/bolt?branch=master) [![GoDoc](https://godoc.org/github.com/boltdb/bolt?status.png)](https://godoc.org/github.com/boltdb/bolt) ![Version](http://img.shields.io/badge/version-1.0-green.png)
+====
+
+Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas]
+[LMDB project][lmdb]. The goal of the project is to provide a simple,
+fast, and reliable database for projects that don't require a full database
+server such as Postgres or MySQL.
+
+Since Bolt is meant to be used as such a low-level piece of functionality,
+simplicity is key. The API will be small and only focus on getting values
+and setting values. That's it.
+
+[hyc_symas]: https://twitter.com/hyc_symas
+[lmdb]: http://symas.com/mdb/
+
+
+## Project Status
+
+Bolt is stable and the API is fixed. Full unit test coverage and randomized
+black box testing are used to ensure database consistency and thread safety.
+Bolt is currently in high-load production environments serving databases as
+large as 1TB. Many companies such as Shopify and Heroku use Bolt-backed
+services every day.
+
+
+## Getting Started
+
+### Installing
+
+To start using Bolt, install Go and run `go get`:
+
+```sh
+$ go get github.com/boltdb/bolt/...
+```
+
+This will retrieve the library and install the `bolt` command line utility into
+your `$GOBIN` path.
+
+
+### Opening a database
+
+The top-level object in Bolt is a `DB`. It is represented as a single file on
+your disk and represents a consistent snapshot of your data.
+
+To open your database, simply use the `bolt.Open()` function:
+
+```go
+package main
+
+import (
+	"log"
+
+	"github.com/boltdb/bolt"
+)
+
+func main() {
+	// Open the my.db data file in your current directory.
+	// It will be created if it doesn't exist.
+	db, err := bolt.Open("my.db", 0600, nil)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer db.Close()
+
+	...
+}
+```
+
+Please note that Bolt obtains a file lock on the data file so multiple processes
+cannot open the same database at the same time. Opening an already open Bolt
+database will cause it to hang until the other process closes it. To prevent
+an indefinite wait you can pass a timeout option to the `Open()` function:
+
+```go
+db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: 1 * time.Second})
+```
+
+
+### Transactions
+
+Bolt allows only one read-write transaction at a time but allows as many
+read-only transactions as you want at a time. Each transaction has a consistent
+view of the data as it existed when the transaction started.
+
+Individual transactions and all objects created from them (e.g. buckets, keys)
+are not thread safe. To work with data in multiple goroutines you must start
+a transaction for each one or use locking to ensure only one goroutine accesses
+a transaction at a time. Creating a transaction from the `DB` is thread safe.
+
+
+#### Read-write transactions
+
+To start a read-write transaction, you can use the `DB.Update()` function:
+
+```go
+err := db.Update(func(tx *bolt.Tx) error {
+	...
+	return nil
+})
+```
+
+Inside the closure, you have a consistent view of the database. You commit the
+transaction by returning `nil` at the end. You can also rollback the transaction
+at any point by returning an error. All database operations are allowed inside
+a read-write transaction.
+
+Always check the return error as it will report any disk failures that can cause
+your transaction to not complete. If you return an error within your closure
+it will be passed through.
+
+
+#### Read-only transactions
+
+To start a read-only transaction, you can use the `DB.View()` function:
+
+```go
+err := db.View(func(tx *bolt.Tx) error {
+	...
+	return nil
+})
+```
+
+You also get a consistent view of the database within this closure; however,
+no mutating operations are allowed within a read-only transaction. You can only
+retrieve buckets, retrieve values, and copy the database within a read-only
+transaction.
+
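Because individual transactions are not thread safe (see the note above) but creating them from the `DB` is, a common pattern is to give each goroutine its own read-only transaction. A minimal sketch, assuming a `db` opened as in the earlier example, a bucket named `MyBucket` that may or may not exist yet, and `fmt`, `log`, and `sync` imported:

```go
var wg sync.WaitGroup
for i := 0; i < 4; i++ {
	wg.Add(1)
	go func(id int) {
		defer wg.Done()
		// Each goroutine opens its own read-only transaction instead of
		// sharing one; creating transactions from the DB is thread safe.
		err := db.View(func(tx *bolt.Tx) error {
			if b := tx.Bucket([]byte("MyBucket")); b != nil {
				fmt.Printf("goroutine %d sees: %s\n", id, b.Get([]byte("answer")))
			}
			return nil
		})
		if err != nil {
			log.Println(err)
		}
	}(i)
}
wg.Wait()
```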
+
+#### Batch read-write transactions
+
+Each `DB.Update()` waits for the disk to commit the writes. This overhead
+can be minimized by combining multiple updates with the `DB.Batch()`
+function:
+
+```go
+err := db.Batch(func(tx *bolt.Tx) error {
+	...
+	return nil
+})
+```
+
+Concurrent Batch calls are opportunistically combined into larger
+transactions. Batch is only useful when there are multiple goroutines
+calling it.
+
+The trade-off is that `Batch` can call the given
+function multiple times if parts of the transaction fail. The
+function must be idempotent and side effects must take effect only
+after a successful return from `DB.Batch()`.
+
+For example: don't display messages from inside the function, instead
+set variables in the enclosing scope:
+
+```go
+var id uint64
+err := db.Batch(func(tx *bolt.Tx) error {
+	// Find last key in bucket, decode as bigendian uint64, increment
+	// by one, encode back to []byte, and add new key.
+	...
+	id = newValue
+	return nil
+})
+if err != nil {
+	return ...
+}
+fmt.Printf("Allocated ID %d\n", id)
+```
+
+
+#### Managing transactions manually
+
+The `DB.View()` and `DB.Update()` functions are wrappers around the `DB.Begin()`
+function. These helper functions will start the transaction, execute a function,
+and then safely close your transaction if an error is returned. This is the
+recommended way to use Bolt transactions.
+
+However, sometimes you may want to manually start and end your transactions.
+You can use the `DB.Begin()` function directly but _please_ be sure to close the
+transaction.
+
+```go
+// Start a writable transaction.
+tx, err := db.Begin(true)
+if err != nil {
+    return err
+}
+defer tx.Rollback()
+
+// Use the transaction...
+_, err = tx.CreateBucket([]byte("MyBucket"))
+if err != nil {
+    return err
+}
+
+// Commit the transaction and check for error.
+if err := tx.Commit(); err != nil {
+    return err
+}
+```
+
+The first argument to `DB.Begin()` is a boolean stating if the transaction
+should be writable.
+
+
+### Using buckets
+
+Buckets are collections of key/value pairs within the database. All keys in a
+bucket must be unique. You can create a bucket using the `Tx.CreateBucket()`
+function:
+
+```go
+db.Update(func(tx *bolt.Tx) error {
+	b, err := tx.CreateBucket([]byte("MyBucket"))
+	if err != nil {
+		return fmt.Errorf("create bucket: %s", err)
+	}
+	return nil
+})
+```
+
+You can also create a bucket only if it doesn't exist by using the
+`Tx.CreateBucketIfNotExists()` function. It's a common pattern to call this
+function for all your top-level buckets after you open your database so you can
+guarantee that they exist for future transactions.
+
+To delete a bucket, simply call the `Tx.DeleteBucket()` function.
+
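A minimal sketch of that setup pattern, assuming a `db` in scope as in the earlier examples (the bucket names here are placeholders):

```go
err := db.Update(func(tx *bolt.Tx) error {
	// Ensure all top-level buckets exist before they are used.
	for _, name := range []string{"MyBucket", "Events"} {
		if _, err := tx.CreateBucketIfNotExists([]byte(name)); err != nil {
			return fmt.Errorf("create bucket %s: %s", name, err)
		}
	}
	// Remove a bucket that is no longer needed; ignore it if it was never created.
	if err := tx.DeleteBucket([]byte("Obsolete")); err != nil && err != bolt.ErrBucketNotFound {
		return err
	}
	return nil
})
```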
+
+### Using key/value pairs
+
+To save a key/value pair to a bucket, use the `Bucket.Put()` function:
+
+```go
+db.Update(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	err := b.Put([]byte("answer"), []byte("42"))
+	return err
+})
+```
+
+This will set the value of the `"answer"` key to `"42"` in the `MyBucket`
+bucket. To retrieve this value, we can use the `Bucket.Get()` function:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	v := b.Get([]byte("answer"))
+	fmt.Printf("The answer is: %s\n", v)
+	return nil
+})
+```
+
+The `Get()` function does not return an error because its operation is
+guaranteed to work (unless there is some kind of system failure). If the key
+exists then it will return its byte slice value. If it doesn't exist then it
+will return `nil`. It's important to note that you can have a zero-length value
+set to a key, which is different from the key not existing.
+
+Use the `Bucket.Delete()` function to delete a key from the bucket.
+
+Please note that values returned from `Get()` are only valid while the
+transaction is open. If you need to use a value outside of the transaction
+then you must use `copy()` to copy it to another byte slice.
+
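For example, a sketch of copying a value out so it can be used after the transaction closes (reusing the `MyBucket`/`answer` pair from above, with `db`, `fmt`, and `log` in scope):

```go
var answer []byte
err := db.View(func(tx *bolt.Tx) error {
	v := tx.Bucket([]byte("MyBucket")).Get([]byte("answer"))
	// v points into Bolt's memory map; copy it before the transaction ends.
	answer = make([]byte, len(v))
	copy(answer, v)
	return nil
})
if err != nil {
	log.Fatal(err)
}
fmt.Printf("The answer is: %s\n", answer)
```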
+
+### Iterating over keys
+
+Bolt stores its keys in byte-sorted order within a bucket. This makes sequential
+iteration over these keys extremely fast. To iterate over keys we'll use a
+`Cursor`:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	c := b.Cursor()
+
+	for k, v := c.First(); k != nil; k, v = c.Next() {
+		fmt.Printf("key=%s, value=%s\n", k, v)
+	}
+
+	return nil
+})
+```
+
+The cursor allows you to move to a specific point in the list of keys and move
+forward or backward through the keys one at a time.
+
+The following functions are available on the cursor:
+
+```
+First()  Move to the first key.
+Last()   Move to the last key.
+Seek()   Move to a specific key.
+Next()   Move to the next key.
+Prev()   Move to the previous key.
+```
+
+When you have iterated to the end of the cursor, `Next()` will return `nil`.
+You must seek to a position using `First()`, `Last()`, or `Seek()` before
+calling `Next()` or `Prev()`. If you do not seek to a position then these
+functions will return `nil`.
+
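For example, a sketch of iterating in reverse with `Last()` and `Prev()`:

```go
db.View(func(tx *bolt.Tx) error {
	c := tx.Bucket([]byte("MyBucket")).Cursor()

	// Walk the bucket backwards, from the last key to the first.
	for k, v := c.Last(); k != nil; k, v = c.Prev() {
		fmt.Printf("key=%s, value=%s\n", k, v)
	}

	return nil
})
```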
+
+#### Prefix scans
+
+To iterate over a key prefix, you can combine `Seek()` and `bytes.HasPrefix()`:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	c := tx.Bucket([]byte("MyBucket")).Cursor()
+
+	prefix := []byte("1234")
+	for k, v := c.Seek(prefix); bytes.HasPrefix(k, prefix); k, v = c.Next() {
+		fmt.Printf("key=%s, value=%s\n", k, v)
+	}
+
+	return nil
+})
+```
+
+#### Range scans
+
+Another common use case is scanning over a range such as a time range. If you
+use a sortable time encoding such as RFC3339 then you can query a specific
+date range like this:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	// Assume our events bucket has RFC3339 encoded time keys.
+	c := tx.Bucket([]byte("Events")).Cursor()
+
+	// Our time range spans the 90's decade.
+	min := []byte("1990-01-01T00:00:00Z")
+	max := []byte("2000-01-01T00:00:00Z")
+
+	// Iterate over the 90's.
+	for k, v := c.Seek(min); k != nil && bytes.Compare(k, max) <= 0; k, v = c.Next() {
+		fmt.Printf("%s: %s\n", k, v)
+	}
+
+	return nil
+})
+```
+
+
+#### ForEach()
+
+You can also use the function `ForEach()` if you know you'll be iterating over
+all the keys in a bucket:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	b.ForEach(func(k, v []byte) error {
+		fmt.Printf("key=%s, value=%s\n", k, v)
+		return nil
+	})
+	return nil
+})
+```
+
+
+### Nested buckets
+
+You can also store a bucket in a key to create nested buckets. The API is the
+same as the bucket management API on the `Tx` object:
+
+```go
+func (*Bucket) CreateBucket(key []byte) (*Bucket, error)
+func (*Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error)
+func (*Bucket) DeleteBucket(key []byte) error
+```
+
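A minimal usage sketch (the `Accounts`/`alice` names are illustrative only):

```go
db.Update(func(tx *bolt.Tx) error {
	// Create a top-level bucket, then a sub-bucket inside it.
	root, err := tx.CreateBucketIfNotExists([]byte("Accounts"))
	if err != nil {
		return err
	}
	user, err := root.CreateBucketIfNotExists([]byte("alice"))
	if err != nil {
		return err
	}
	return user.Put([]byte("email"), []byte("alice@example.com"))
})
```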
+
+### Database backups
+
+Bolt is a single file so it's easy to backup. You can use the `Tx.WriteTo()`
+function to write a consistent view of the database to a writer. If you call
+this from a read-only transaction, it will perform a hot backup and not block
+your other database reads and writes. It will also use `O_DIRECT` when available
+to prevent page cache thrashing.
+
+One common use case is to backup over HTTP so you can use tools like `cURL` to
+do database backups:
+
+```go
+func BackupHandleFunc(w http.ResponseWriter, req *http.Request) {
+	err := db.View(func(tx *bolt.Tx) error {
+		w.Header().Set("Content-Type", "application/octet-stream")
+		w.Header().Set("Content-Disposition", `attachment; filename="my.db"`)
+		w.Header().Set("Content-Length", strconv.Itoa(int(tx.Size())))
+		_, err := tx.WriteTo(w)
+		return err
+	})
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+	}
+}
+```
+
+Then you can backup using this command:
+
+```sh
+$ curl http://localhost/backup > my.db
+```
+
+Or you can open your browser to `http://localhost/backup` and it will download
+automatically.
+
+If you want to backup to another file you can use the `Tx.CopyFile()` helper
+function.
+
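For instance, a sketch that writes a consistent copy of the database to a local file (the `my.db.bak` path is a placeholder; `db` and `log` are assumed to be in scope):

```go
err := db.View(func(tx *bolt.Tx) error {
	return tx.CopyFile("my.db.bak", 0600)
})
if err != nil {
	log.Fatal(err)
}
```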
+
+### Statistics
+
+The database keeps a running count of many of the internal operations it
+performs so you can better understand what's going on. By grabbing a snapshot
+of these stats at two points in time we can see what operations were performed
+in that time range.
+
+For example, we could start a goroutine to log stats every 10 seconds:
+
+```go
+go func() {
+	// Grab the initial stats.
+	prev := db.Stats()
+
+	for {
+		// Wait for 10s.
+		time.Sleep(10 * time.Second)
+
+		// Grab the current stats and diff them.
+		stats := db.Stats()
+		diff := stats.Sub(&prev)
+
+		// Encode stats to JSON and print to STDERR.
+		json.NewEncoder(os.Stderr).Encode(diff)
+
+		// Save stats for the next loop.
+		prev = stats
+	}
+}()
+```
+
+It's also useful to pipe these stats to a service such as statsd for monitoring
+or to provide an HTTP endpoint that will perform a fixed-length sample.
+
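As a sketch of the HTTP approach (assuming `encoding/json` and `net/http` are imported and `db` is in scope; the `/stats` route is a placeholder):

```go
http.HandleFunc("/stats", func(w http.ResponseWriter, req *http.Request) {
	// Serve a point-in-time snapshot of the database counters as JSON.
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(db.Stats())
})
```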
+
+## Resources
+
+For more information on getting started with Bolt, check out the following articles:
+
+* [Intro to BoltDB: Painless Performant Persistence](http://npf.io/2014/07/intro-to-boltdb-painless-performant-persistence/) by [Nate Finch](https://github.com/natefinch).
+* [Bolt -- an embedded key/value database for Go](https://www.progville.com/go/bolt-embedded-db-golang/) by Progville
+
+
+## Comparison with other databases
+
+### Postgres, MySQL, & other relational databases
+
+Relational databases structure data into rows and are only accessible through
+the use of SQL. This approach provides flexibility in how you store and query
+your data but also incurs overhead in parsing and planning SQL statements. Bolt
+accesses all data by a byte slice key. This makes Bolt fast to read and write
+data by key but provides no built-in support for joining values together.
+
+Most relational databases (with the exception of SQLite) are standalone servers
+that run separately from your application. This gives your systems
+flexibility to connect multiple application servers to a single database
+server but also adds overhead in serializing and transporting data over the
+network. Bolt runs as a library included in your application so all data access
+has to go through your application's process. This brings data closer to your
+application but limits multi-process access to the data.
+
+
+### LevelDB, RocksDB
+
+LevelDB and its derivatives (RocksDB, HyperLevelDB) are similar to Bolt in that
+they are libraries bundled into the application, however, their underlying
+structure is a log-structured merge-tree (LSM tree). An LSM tree optimizes
+random writes by using a write ahead log and multi-tiered, sorted files called
+SSTables. Bolt uses a B+tree internally and only a single file. Both approaches
+have trade offs.
+
+If you require a high random write throughput (>10,000 w/sec) or you need to use
+spinning disks then LevelDB could be a good choice. If your application is
+read-heavy or does a lot of range scans then Bolt could be a good choice.
+
+One other important consideration is that LevelDB does not have transactions.
+It supports batch writing of key/value pairs and it supports read snapshots
+but it will not give you the ability to do a compare-and-swap operation safely.
+Bolt supports fully serializable ACID transactions.
+
+
+### LMDB
+
+Bolt was originally a port of LMDB so it is architecturally similar. Both use
+a B+tree, have ACID semantics with fully serializable transactions, and support
+lock-free MVCC using a single writer and multiple readers.
+
+The two projects have somewhat diverged. LMDB heavily focuses on raw performance
+while Bolt has focused on simplicity and ease of use. For example, LMDB allows
+several unsafe actions such as direct writes for the sake of performance. Bolt
+opts to disallow actions which can leave the database in a corrupted state. The
+only exception to this in Bolt is `DB.NoSync`.
+
+There are also a few differences in API. LMDB requires a maximum mmap size when
+opening an `mdb_env` whereas Bolt will handle incremental mmap resizing
+automatically. LMDB overloads the getter and setter functions with multiple
+flags whereas Bolt splits these specialized cases into their own functions.
+
+
+## Caveats & Limitations
+
+It's important to pick the right tool for the job and Bolt is no exception.
+Here are a few things to note when evaluating and using Bolt:
+
+* Bolt is good for read intensive workloads. Sequential write performance is
+  also fast but random writes can be slow. You can add a write-ahead log or
+  [transaction coalescer](https://github.com/boltdb/coalescer) in front of Bolt
+  to mitigate this issue.
+
+* Bolt uses a B+tree internally so there can be a lot of random page access.
+  SSDs provide a significant performance boost over spinning disks.
+
+* Try to avoid long running read transactions. Bolt uses copy-on-write so
+  old pages cannot be reclaimed while an old transaction is using them.
+
+* Byte slices returned from Bolt are only valid during a transaction. Once the
+  transaction has been committed or rolled back then the memory they point to
+  can be reused by a new page or can be unmapped from virtual memory and you'll
+  see an `unexpected fault address` panic when accessing it.
+
+* Be careful when using `Bucket.FillPercent`. Setting a high fill percent for
+  buckets that have random inserts will cause your database to have very poor
+  page utilization.
+
+* Use larger buckets in general. Smaller buckets cause poor page utilization
+  once they become larger than the page size (typically 4KB).
+
+* Bulk loading a lot of random writes into a new bucket can be slow as the
+  page will not split until the transaction is committed. Randomly inserting
+  more than 100,000 key/value pairs into a single new bucket in a single
+  transaction is not advised.
+
+* Bolt uses a memory-mapped file so the underlying operating system handles the
+  caching of the data. Typically, the OS will cache as much of the file as it
+  can in memory and will release memory as needed to other processes. This means
+  that Bolt can show very high memory usage when working with large databases.
+  However, this is expected and the OS will release memory as needed. Bolt can
+  handle databases much larger than the available physical RAM.
+
+* Because of the way pages are laid out on disk, Bolt cannot truncate data files
+  and return free pages back to the disk. Instead, Bolt maintains a free list
+  of unused pages within its data file. These free pages can be reused by later
+  transactions. This works well for many use cases as databases generally tend
+  to grow. However, it's important to note that deleting large chunks of data
+  will not allow you to reclaim that space on disk.
+
+  For more information on page allocation, [see this comment][page-allocation].
+
+[page-allocation]: https://github.com/boltdb/bolt/issues/308#issuecomment-74811638
+
+
+## Other Projects Using Bolt
+
+Below is a list of public, open source projects that use Bolt:
+
+* [Operation Go: A Routine Mission](http://gocode.io) - An online programming game for Golang using Bolt for user accounts and a leaderboard.
+* [Bazil](https://github.com/bazillion/bazil) - A file system that lets your data reside where it is most convenient for it to reside.
+* [DVID](https://github.com/janelia-flyem/dvid) - Added Bolt as optional storage engine and testing it against Basho-tuned leveldb.
+* [Skybox Analytics](https://github.com/skybox/skybox) - A standalone funnel analysis tool for web analytics.
+* [Scuttlebutt](https://github.com/benbjohnson/scuttlebutt) - Uses Bolt to store and process all Twitter mentions of GitHub projects.
+* [Wiki](https://github.com/peterhellberg/wiki) - A tiny wiki using Goji, BoltDB and Blackfriday.
+* [ChainStore](https://github.com/nulayer/chainstore) - Simple key-value interface to a variety of storage engines organized as a chain of operations.
+* [MetricBase](https://github.com/msiebuhr/MetricBase) - Single-binary version of Graphite.
+* [Gitchain](https://github.com/gitchain/gitchain) - Decentralized, peer-to-peer Git repositories aka "Git meets Bitcoin".
+* [event-shuttle](https://github.com/sclasen/event-shuttle) - A Unix system service to collect and reliably deliver messages to Kafka.
+* [ipxed](https://github.com/kelseyhightower/ipxed) - Web interface and api for ipxed.
+* [BoltStore](https://github.com/yosssi/boltstore) - Session store using Bolt.
+* [photosite/session](http://godoc.org/bitbucket.org/kardianos/photosite/session) - Sessions for a photo viewing site.
+* [LedisDB](https://github.com/siddontang/ledisdb) - A high performance NoSQL, using Bolt as optional storage.
+* [ipLocator](https://github.com/AndreasBriese/ipLocator) - A fast ip-geo-location-server using bolt with bloom filters.
+* [cayley](https://github.com/google/cayley) - Cayley is an open-source graph database using Bolt as optional backend.
+* [bleve](http://www.blevesearch.com/) - A pure Go search engine similar to ElasticSearch that uses Bolt as the default storage backend.
+* [tentacool](https://github.com/optiflows/tentacool) - REST api server to manage system stuff (IP, DNS, Gateway...) on a linux server.
+* [SkyDB](https://github.com/skydb/sky) - Behavioral analytics database.
+* [Seaweed File System](https://github.com/chrislusf/weed-fs) - Highly scalable distributed key~file system with O(1) disk read.
+* [InfluxDB](http://influxdb.com) - Scalable datastore for metrics, events, and real-time analytics.
+
+If you are using Bolt in a project please send a pull request to add it to the list.

+ 135 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/batch.go

@@ -0,0 +1,135 @@
+package bolt
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"time"
+)
+
+// Batch calls fn as part of a batch. It behaves similarly to Update,
+// except:
+//
+// 1. concurrent Batch calls can be combined into a single Bolt
+// transaction.
+//
+// 2. the function passed to Batch may be called multiple times,
+// regardless of whether it returns an error or not.
+//
+// This means that Batch function side effects must be idempotent and
+// take permanent effect only after a successful return is seen in
+// the caller.
+//
+// Batch is only useful when there are multiple goroutines calling it.
+func (db *DB) Batch(fn func(*Tx) error) error {
+	errCh := make(chan error, 1)
+
+	db.batchMu.Lock()
+	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
+		// There is no existing batch, or the existing batch is full; start a new one.
+		db.batch = &batch{
+			db: db,
+		}
+		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
+	}
+	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
+	if len(db.batch.calls) >= db.MaxBatchSize {
+		// wake up batch, it's ready to run
+		go db.batch.trigger()
+	}
+	db.batchMu.Unlock()
+
+	err := <-errCh
+	if err == trySolo {
+		err = db.Update(fn)
+	}
+	return err
+}
+
+type call struct {
+	fn  func(*Tx) error
+	err chan<- error
+}
+
+type batch struct {
+	db    *DB
+	timer *time.Timer
+	start sync.Once
+	calls []call
+}
+
+// trigger runs the batch if it hasn't already been run.
+func (b *batch) trigger() {
+	b.start.Do(b.run)
+}
+
+// run performs the transactions in the batch and communicates results
+// back to DB.Batch.
+func (b *batch) run() {
+	b.db.batchMu.Lock()
+	b.timer.Stop()
+	// Make sure no new work is added to this batch, but don't break
+	// other batches.
+	if b.db.batch == b {
+		b.db.batch = nil
+	}
+	b.db.batchMu.Unlock()
+
+retry:
+	for len(b.calls) > 0 {
+		var failIdx = -1
+		err := b.db.Update(func(tx *Tx) error {
+			for i, c := range b.calls {
+				if err := safelyCall(c.fn, tx); err != nil {
+					failIdx = i
+					return err
+				}
+			}
+			return nil
+		})
+
+		if failIdx >= 0 {
+			// take the failing transaction out of the batch. it's
+			// safe to shorten b.calls here because db.batch no longer
+			// points to us, and we hold the mutex anyway.
+			c := b.calls[failIdx]
+			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
+			// tell the submitter to re-run it solo and continue with the rest of the batch
+			c.err <- trySolo
+			continue retry
+		}
+
+		// pass success, or bolt internal errors, to all callers
+		for _, c := range b.calls {
+			if c.err != nil {
+				c.err <- err
+			}
+		}
+		break retry
+	}
+}
+
+// trySolo is a special sentinel error value used for signaling that a
+// transaction function should be re-run. It should never be seen by
+// callers.
+var trySolo = errors.New("batch function returned an error and should be re-run solo")
+
+type panicked struct {
+	reason interface{}
+}
+
+func (p panicked) Error() string {
+	if err, ok := p.reason.(error); ok {
+		return err.Error()
+	}
+	return fmt.Sprintf("panic: %v", p.reason)
+}
+
+func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
+	defer func() {
+		if p := recover(); p != nil {
+			err = panicked{p}
+		}
+	}()
+	return fn(tx)
+}
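
The idempotency requirement documented above is easiest to see in a concrete caller. The following is an editorial sketch, not part of this commit; it assumes the vendored import path used elsewhere in this change and an illustrative `counters` bucket. The closure derives the new value from stored state each time it runs and only publishes the result after `Batch` returns:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"log"

	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
)

// incrementCounter bumps a named counter via DB.Batch. The function passed
// to Batch may be retried, so it recomputes the new value from the stored
// one on every run and writes the result into an outer variable that the
// caller reads only after Batch returns successfully.
func incrementCounter(db *bolt.DB, name string) (uint64, error) {
	var result uint64
	err := db.Batch(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("counters"))
		if err != nil {
			return err
		}
		var n uint64
		if v := b.Get([]byte(name)); v != nil {
			n = binary.BigEndian.Uint64(v)
		}
		n++
		buf := make([]byte, 8)
		binary.BigEndian.PutUint64(buf, n)
		if err := b.Put([]byte(name), buf); err != nil {
			return err
		}
		result = n
		return nil
	})
	return result, err
}

func main() {
	db, err := bolt.Open("my.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	n, err := incrementCounter(db, "hits")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("hits:", n)
}
```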

+ 170 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/batch_benchmark_test.go

@@ -0,0 +1,170 @@
+package bolt_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"hash/fnv"
+	"sync"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+func validateBatchBench(b *testing.B, db *TestDB) {
+	var rollback = errors.New("sentinel error to cause rollback")
+	validate := func(tx *bolt.Tx) error {
+		bucket := tx.Bucket([]byte("bench"))
+		h := fnv.New32a()
+		buf := make([]byte, 4)
+		for id := uint32(0); id < 1000; id++ {
+			binary.LittleEndian.PutUint32(buf, id)
+			h.Reset()
+			h.Write(buf[:])
+			k := h.Sum(nil)
+			v := bucket.Get(k)
+			if v == nil {
+				b.Errorf("not found id=%d key=%x", id, k)
+				continue
+			}
+			if g, e := v, []byte("filler"); !bytes.Equal(g, e) {
+				b.Errorf("bad value for id=%d key=%x: %s != %q", id, k, g, e)
+			}
+			if err := bucket.Delete(k); err != nil {
+				return err
+			}
+		}
+		// should be empty now
+		c := bucket.Cursor()
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			b.Errorf("unexpected key: %x = %q", k, v)
+		}
+		return rollback
+	}
+	if err := db.Update(validate); err != nil && err != rollback {
+		b.Error(err)
+	}
+}
+
+func BenchmarkDBBatchAutomatic(b *testing.B) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("bench"))
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		start := make(chan struct{})
+		var wg sync.WaitGroup
+
+		for round := 0; round < 1000; round++ {
+			wg.Add(1)
+
+			go func(id uint32) {
+				defer wg.Done()
+				<-start
+
+				h := fnv.New32a()
+				buf := make([]byte, 4)
+				binary.LittleEndian.PutUint32(buf, id)
+				h.Write(buf[:])
+				k := h.Sum(nil)
+				insert := func(tx *bolt.Tx) error {
+					b := tx.Bucket([]byte("bench"))
+					return b.Put(k, []byte("filler"))
+				}
+				if err := db.Batch(insert); err != nil {
+					b.Error(err)
+					return
+				}
+			}(uint32(round))
+		}
+		close(start)
+		wg.Wait()
+	}
+
+	b.StopTimer()
+	validateBatchBench(b, db)
+}
+
+func BenchmarkDBBatchSingle(b *testing.B) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("bench"))
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		start := make(chan struct{})
+		var wg sync.WaitGroup
+
+		for round := 0; round < 1000; round++ {
+			wg.Add(1)
+			go func(id uint32) {
+				defer wg.Done()
+				<-start
+
+				h := fnv.New32a()
+				buf := make([]byte, 4)
+				binary.LittleEndian.PutUint32(buf, id)
+				h.Write(buf[:])
+				k := h.Sum(nil)
+				insert := func(tx *bolt.Tx) error {
+					b := tx.Bucket([]byte("bench"))
+					return b.Put(k, []byte("filler"))
+				}
+				if err := db.Update(insert); err != nil {
+					b.Error(err)
+					return
+				}
+			}(uint32(round))
+		}
+		close(start)
+		wg.Wait()
+	}
+
+	b.StopTimer()
+	validateBatchBench(b, db)
+}
+
+func BenchmarkDBBatchManual10x100(b *testing.B) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("bench"))
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		start := make(chan struct{})
+		var wg sync.WaitGroup
+
+		for major := 0; major < 10; major++ {
+			wg.Add(1)
+			go func(id uint32) {
+				defer wg.Done()
+				<-start
+
+				insert100 := func(tx *bolt.Tx) error {
+					h := fnv.New32a()
+					buf := make([]byte, 4)
+					for minor := uint32(0); minor < 100; minor++ {
+						binary.LittleEndian.PutUint32(buf, uint32(id*100+minor))
+						h.Reset()
+						h.Write(buf[:])
+						k := h.Sum(nil)
+						b := tx.Bucket([]byte("bench"))
+						if err := b.Put(k, []byte("filler")); err != nil {
+							return err
+						}
+					}
+					return nil
+				}
+				if err := db.Update(insert100); err != nil {
+					b.Fatal(err)
+				}
+			}(uint32(major))
+		}
+		close(start)
+		wg.Wait()
+	}
+
+	b.StopTimer()
+	validateBatchBench(b, db)
+}

+ 148 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/batch_example_test.go

@@ -0,0 +1,148 @@
+package bolt_test
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net/http"
+	"net/http/httptest"
+	"os"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+// Set this to see how the counts are actually updated.
+const verbose = false
+
+// Counter updates a counter in Bolt for every URL path requested.
+type counter struct {
+	db *bolt.DB
+}
+
+func (c counter) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
+	// Communicates the new count from a successful database
+	// transaction.
+	var result uint64
+
+	increment := func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucketIfNotExists([]byte("hits"))
+		if err != nil {
+			return err
+		}
+		key := []byte(req.URL.String())
+		// Decode handles key not found for us.
+		count := decode(b.Get(key)) + 1
+		b.Put(key, encode(count))
+		// All good, communicate new count.
+		result = count
+		return nil
+	}
+	if err := c.db.Batch(increment); err != nil {
+		http.Error(rw, err.Error(), 500)
+		return
+	}
+
+	if verbose {
+		log.Printf("server: %s: %d", req.URL.String(), result)
+	}
+
+	rw.Header().Set("Content-Type", "application/octet-stream")
+	fmt.Fprintf(rw, "%d\n", result)
+}
+
+func client(id int, base string, paths []string) error {
+	// Process paths in random order.
+	rng := rand.New(rand.NewSource(int64(id)))
+	permutation := rng.Perm(len(paths))
+
+	for i := range paths {
+		path := paths[permutation[i]]
+		resp, err := http.Get(base + path)
+		if err != nil {
+			return err
+		}
+		defer resp.Body.Close()
+		buf, err := ioutil.ReadAll(resp.Body)
+		if err != nil {
+			return err
+		}
+		if verbose {
+			log.Printf("client: %s: %s", path, buf)
+		}
+	}
+	return nil
+}
+
+func ExampleDB_Batch() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Start our web server
+	count := counter{db}
+	srv := httptest.NewServer(count)
+	defer srv.Close()
+
+	// Decrease the batch size to make things more interesting.
+	db.MaxBatchSize = 3
+
+	// Get every path multiple times concurrently.
+	const clients = 10
+	paths := []string{
+		"/foo",
+		"/bar",
+		"/baz",
+		"/quux",
+		"/thud",
+		"/xyzzy",
+	}
+	errors := make(chan error, clients)
+	for i := 0; i < clients; i++ {
+		go func(id int) {
+			errors <- client(id, srv.URL, paths)
+		}(i)
+	}
+	// Check all responses to make sure there's no error.
+	for i := 0; i < clients; i++ {
+		if err := <-errors; err != nil {
+			fmt.Printf("client error: %v", err)
+			return
+		}
+	}
+
+	// Check the final result
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("hits"))
+		c := b.Cursor()
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			fmt.Printf("hits to %s: %d\n", k, decode(v))
+		}
+		return nil
+	})
+
+	// Output:
+	// hits to /bar: 10
+	// hits to /baz: 10
+	// hits to /foo: 10
+	// hits to /quux: 10
+	// hits to /thud: 10
+	// hits to /xyzzy: 10
+}
+
+// encode marshals a counter.
+func encode(n uint64) []byte {
+	buf := make([]byte, 8)
+	binary.BigEndian.PutUint64(buf, n)
+	return buf
+}
+
+// decode unmarshals a counter. Nil buffers are decoded as 0.
+func decode(buf []byte) uint64 {
+	if buf == nil {
+		return 0
+	}
+	return binary.BigEndian.Uint64(buf)
+}

+ 167 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/batch_test.go

@@ -0,0 +1,167 @@
+package bolt_test
+
+import (
+	"testing"
+	"time"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+// Ensure two functions can perform updates in a single batch.
+func TestDB_Batch(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("widgets"))
+
+	// Iterate over multiple updates in separate goroutines.
+	n := 2
+	ch := make(chan error)
+	for i := 0; i < n; i++ {
+		go func(i int) {
+			ch <- db.Batch(func(tx *bolt.Tx) error {
+				return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{})
+			})
+		}(i)
+	}
+
+	// Check all responses to make sure there's no error.
+	for i := 0; i < n; i++ {
+		if err := <-ch; err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Ensure data is correct.
+	db.MustView(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		for i := 0; i < n; i++ {
+			if v := b.Get(u64tob(uint64(i))); v == nil {
+				t.Errorf("key not found: %d", i)
+			}
+		}
+		return nil
+	})
+}
+
+func TestDB_Batch_Panic(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var sentinel int
+	var bork = &sentinel
+	var problem interface{}
+	var err error
+
+	// Execute a function inside a batch that panics.
+	func() {
+		defer func() {
+			if p := recover(); p != nil {
+				problem = p
+			}
+		}()
+		err = db.Batch(func(tx *bolt.Tx) error {
+			panic(bork)
+		})
+	}()
+
+	// Verify there is no error.
+	if g, e := err, error(nil); g != e {
+		t.Fatalf("wrong error: %v != %v", g, e)
+	}
+	// Verify the panic was captured.
+	if g, e := problem, bork; g != e {
+		t.Fatalf("wrong error: %v != %v", g, e)
+	}
+}
+
+func TestDB_BatchFull(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("widgets"))
+
+	const size = 3
+	// buffered so we never leak goroutines
+	ch := make(chan error, size)
+	put := func(i int) {
+		ch <- db.Batch(func(tx *bolt.Tx) error {
+			return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{})
+		})
+	}
+
+	db.MaxBatchSize = size
+	// high enough to never trigger here
+	db.MaxBatchDelay = 1 * time.Hour
+
+	go put(1)
+	go put(2)
+
+	// Give the batch a chance to exhibit bugs.
+	time.Sleep(10 * time.Millisecond)
+
+	// not triggered yet
+	select {
+	case <-ch:
+		t.Fatalf("batch triggered too early")
+	default:
+	}
+
+	go put(3)
+
+	// Check all responses to make sure there's no error.
+	for i := 0; i < size; i++ {
+		if err := <-ch; err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Ensure data is correct.
+	db.MustView(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		for i := 1; i <= size; i++ {
+			if v := b.Get(u64tob(uint64(i))); v == nil {
+				t.Errorf("key not found: %d", i)
+			}
+		}
+		return nil
+	})
+}
+
+func TestDB_BatchTime(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("widgets"))
+
+	const size = 1
+	// buffered so we never leak goroutines
+	ch := make(chan error, size)
+	put := func(i int) {
+		ch <- db.Batch(func(tx *bolt.Tx) error {
+			return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{})
+		})
+	}
+
+	db.MaxBatchSize = 1000
+	db.MaxBatchDelay = 0
+
+	go put(1)
+
+	// Batch must trigger by time alone.
+
+	// Check all responses to make sure there's no error.
+	for i := 0; i < size; i++ {
+		if err := <-ch; err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Ensure data is correct.
+	db.MustView(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		for i := 1; i <= size; i++ {
+			if v := b.Get(u64tob(uint64(i))); v == nil {
+				t.Errorf("key not found: %d", i)
+			}
+		}
+		return nil
+	})
+}

+ 7 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_386.go

@@ -0,0 +1,7 @@
+package bolt
+
+// maxMapSize represents the largest mmap size supported by Bolt.
+const maxMapSize = 0x7FFFFFFF // 2GB
+
+// maxAllocSize is the size used when creating array pointers.
+const maxAllocSize = 0xFFFFFFF

+ 7 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_amd64.go

@@ -0,0 +1,7 @@
+package bolt
+
+// maxMapSize represents the largest mmap size supported by Bolt.
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
+
+// maxAllocSize is the size used when creating array pointers.
+const maxAllocSize = 0x7FFFFFFF

+ 7 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm.go

@@ -0,0 +1,7 @@
+package bolt
+
+// maxMapSize represents the largest mmap size supported by Bolt.
+const maxMapSize = 0x7FFFFFFF // 2GB
+
+// maxAllocSize is the size used when creating array pointers.
+const maxAllocSize = 0xFFFFFFF

+ 12 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go

@@ -0,0 +1,12 @@
+package bolt
+
+import (
+	"syscall"
+)
+
+var odirect = syscall.O_DIRECT
+
+// fdatasync flushes written data to a file descriptor.
+func fdatasync(db *DB) error {
+	return syscall.Fdatasync(int(db.file.Fd()))
+}

+ 29 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go

@@ -0,0 +1,29 @@
+package bolt
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+const (
+	msAsync      = 1 << iota // perform asynchronous writes
+	msSync                   // perform synchronous writes
+	msInvalidate             // invalidate cached data
+)
+
+var odirect int
+
+func msync(db *DB) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func fdatasync(db *DB) error {
+	if db.data != nil {
+		return msync(db)
+	}
+	return db.file.Sync()
+}

+ 36 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_test.go

@@ -0,0 +1,36 @@
+package bolt_test
+
+import (
+	"fmt"
+	"path/filepath"
+	"reflect"
+	"runtime"
+	"testing"
+)
+
+// assert fails the test if the condition is false.
+func assert(tb testing.TB, condition bool, msg string, v ...interface{}) {
+	if !condition {
+		_, file, line, _ := runtime.Caller(1)
+		fmt.Printf("\033[31m%s:%d: "+msg+"\033[39m\n\n", append([]interface{}{filepath.Base(file), line}, v...)...)
+		tb.FailNow()
+	}
+}
+
+// ok fails the test if an err is not nil.
+func ok(tb testing.TB, err error) {
+	if err != nil {
+		_, file, line, _ := runtime.Caller(1)
+		fmt.Printf("\033[31m%s:%d: unexpected error: %s\033[39m\n\n", filepath.Base(file), line, err.Error())
+		tb.FailNow()
+	}
+}
+
+// equals fails the test if exp is not equal to act.
+func equals(tb testing.TB, exp, act interface{}) {
+	if !reflect.DeepEqual(exp, act) {
+		_, file, line, _ := runtime.Caller(1)
+		fmt.Printf("\033[31m%s:%d:\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(file), line, exp, act)
+		tb.FailNow()
+	}
+}

+ 80 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_unix.go

@@ -0,0 +1,80 @@
+// +build !windows,!plan9
+
+package bolt
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+	"time"
+	"unsafe"
+)
+
+// flock acquires an advisory lock on a file descriptor.
+func flock(f *os.File, timeout time.Duration) error {
+	var t time.Time
+	for {
+		// If we're beyond our timeout then return an error.
+		// This can only occur after we've attempted a flock once.
+		if t.IsZero() {
+			t = time.Now()
+		} else if timeout > 0 && time.Since(t) > timeout {
+			return ErrTimeout
+		}
+
+		// Otherwise attempt to obtain an exclusive lock.
+		err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
+		if err == nil {
+			return nil
+		} else if err != syscall.EWOULDBLOCK {
+			return err
+		}
+
+		// Wait for a bit and try again.
+		time.Sleep(50 * time.Millisecond)
+	}
+}
+
+// funlock releases an advisory lock on a file descriptor.
+func funlock(f *os.File) error {
+	return syscall.Flock(int(f.Fd()), syscall.LOCK_UN)
+}
+
+// mmap memory maps a DB's data file.
+func mmap(db *DB, sz int) error {
+	// Truncate and fsync to ensure file size metadata is flushed.
+	// https://github.com/boltdb/bolt/issues/284
+	if err := db.file.Truncate(int64(sz)); err != nil {
+		return fmt.Errorf("file resize error: %s", err)
+	}
+	if err := db.file.Sync(); err != nil {
+		return fmt.Errorf("file sync error: %s", err)
+	}
+
+	// Map the data file to memory.
+	b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED)
+	if err != nil {
+		return err
+	}
+
+	// Save the original byte slice and convert to a byte array pointer.
+	db.dataref = b
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
+	db.datasz = sz
+	return nil
+}
+
+// munmap unmaps a DB's data file from memory.
+func munmap(db *DB) error {
+	// Ignore the unmap if we have no mapped data.
+	if db.dataref == nil {
+		return nil
+	}
+
+	// Unmap using the original byte slice.
+	err := syscall.Munmap(db.dataref)
+	db.dataref = nil
+	db.data = nil
+	db.datasz = 0
+	return err
+}

+ 74 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go

@@ -0,0 +1,74 @@
+package bolt
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+	"time"
+	"unsafe"
+)
+
+var odirect int
+
+// fdatasync flushes written data to a file descriptor.
+func fdatasync(db *DB) error {
+	return db.file.Sync()
+}
+
+// flock acquires an advisory lock on a file descriptor.
+func flock(f *os.File, _ time.Duration) error {
+	return nil
+}
+
+// funlock releases an advisory lock on a file descriptor.
+func funlock(f *os.File) error {
+	return nil
+}
+
+// mmap memory maps a DB's data file.
+// Based on: https://github.com/edsrzf/mmap-go
+func mmap(db *DB, sz int) error {
+	// Truncate the database to the size of the mmap.
+	if err := db.file.Truncate(int64(sz)); err != nil {
+		return fmt.Errorf("truncate: %s", err)
+	}
+
+	// Open a file mapping handle.
+	sizelo := uint32(sz >> 32)
+	sizehi := uint32(sz) & 0xffffffff
+	h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizelo, sizehi, nil)
+	if h == 0 {
+		return os.NewSyscallError("CreateFileMapping", errno)
+	}
+
+	// Create the memory map.
+	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(sz))
+	if addr == 0 {
+		return os.NewSyscallError("MapViewOfFile", errno)
+	}
+
+	// Close mapping handle.
+	if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
+		return os.NewSyscallError("CloseHandle", err)
+	}
+
+	// Convert to a byte array.
+	db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr)))
+	db.datasz = sz
+
+	return nil
+}
+
+// munmap unmaps a pointer from a file.
+// Based on: https://github.com/edsrzf/mmap-go
+func munmap(db *DB) error {
+	if db.data == nil {
+		return nil
+	}
+
+	addr := (uintptr)(unsafe.Pointer(&db.data[0]))
+	if err := syscall.UnmapViewOfFile(addr); err != nil {
+		return os.NewSyscallError("UnmapViewOfFile", err)
+	}
+	return nil
+}

+ 10 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go

@@ -0,0 +1,10 @@
+// +build !windows,!plan9,!linux,!openbsd
+
+package bolt
+
+var odirect int
+
+// fdatasync flushes written data to a file descriptor.
+func fdatasync(db *DB) error {
+	return db.file.Sync()
+}

+ 743 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go

@@ -0,0 +1,743 @@
+package bolt
+
+import (
+	"bytes"
+	"fmt"
+	"unsafe"
+)
+
+const (
+	// MaxKeySize is the maximum length of a key, in bytes.
+	MaxKeySize = 32768
+
+	// MaxValueSize is the maximum length of a value, in bytes.
+	MaxValueSize = 4294967295
+)
+
+const (
+	maxUint = ^uint(0)
+	minUint = 0
+	maxInt  = int(^uint(0) >> 1)
+	minInt  = -maxInt - 1
+)
+
+const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
+
+const (
+	minFillPercent = 0.1
+	maxFillPercent = 1.0
+)
+
+// DefaultFillPercent is the percentage that split pages are filled.
+// This value can be changed by setting Bucket.FillPercent.
+const DefaultFillPercent = 0.5
+
+// Bucket represents a collection of key/value pairs inside the database.
+type Bucket struct {
+	*bucket
+	tx       *Tx                // the associated transaction
+	buckets  map[string]*Bucket // subbucket cache
+	page     *page              // inline page reference
+	rootNode *node              // materialized node for the root page.
+	nodes    map[pgid]*node     // node cache
+
+	// Sets the threshold for filling nodes when they split. By default,
+	// the bucket will fill to 50% but it can be useful to increase this
+	// amount if you know that your write workloads are mostly append-only.
+	//
+	// This is non-persisted across transactions so it must be set in every Tx.
+	FillPercent float64
+}
+
+// bucket represents the on-file representation of a bucket.
+// This is stored as the "value" of a bucket key. If the bucket is small enough,
+// then its root page can be stored inline in the "value", after the bucket
+// header. In the case of inline buckets, the "root" will be 0.
+type bucket struct {
+	root     pgid   // page id of the bucket's root-level page
+	sequence uint64 // monotonically incrementing, used by NextSequence()
+}
+
+// newBucket returns a new bucket associated with a transaction.
+func newBucket(tx *Tx) Bucket {
+	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
+	if tx.writable {
+		b.buckets = make(map[string]*Bucket)
+		b.nodes = make(map[pgid]*node)
+	}
+	return b
+}
+
+// Tx returns the tx of the bucket.
+func (b *Bucket) Tx() *Tx {
+	return b.tx
+}
+
+// Root returns the root of the bucket.
+func (b *Bucket) Root() pgid {
+	return b.root
+}
+
+// Writable returns whether the bucket is writable.
+func (b *Bucket) Writable() bool {
+	return b.tx.writable
+}
+
+// Cursor creates a cursor associated with the bucket.
+// The cursor is only valid as long as the transaction is open.
+// Do not use a cursor after the transaction is closed.
+func (b *Bucket) Cursor() *Cursor {
+	// Update transaction statistics.
+	b.tx.stats.CursorCount++
+
+	// Allocate and return a cursor.
+	return &Cursor{
+		bucket: b,
+		stack:  make([]elemRef, 0),
+	}
+}
+
+// Bucket retrieves a nested bucket by name.
+// Returns nil if the bucket does not exist.
+func (b *Bucket) Bucket(name []byte) *Bucket {
+	if b.buckets != nil {
+		if child := b.buckets[string(name)]; child != nil {
+			return child
+		}
+	}
+
+	// Move cursor to key.
+	c := b.Cursor()
+	k, v, flags := c.seek(name)
+
+	// Return nil if the key doesn't exist or it is not a bucket.
+	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
+		return nil
+	}
+
+	// Otherwise create a bucket and cache it.
+	var child = b.openBucket(v)
+	if b.buckets != nil {
+		b.buckets[string(name)] = child
+	}
+
+	return child
+}
+
+// Helper method that re-interprets a sub-bucket value
+// from a parent into a Bucket
+func (b *Bucket) openBucket(value []byte) *Bucket {
+	var child = newBucket(b.tx)
+
+	// If this is a writable transaction then we need to copy the bucket entry.
+	// Read-only transactions can point directly at the mmap entry.
+	if b.tx.writable {
+		child.bucket = &bucket{}
+		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
+	} else {
+		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
+	}
+
+	// Save a reference to the inline page if the bucket is inline.
+	if child.root == 0 {
+		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+	}
+
+	return &child
+}
+
+// CreateBucket creates a new bucket at the given key and returns the new bucket.
+// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
+func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
+	if b.tx.db == nil {
+		return nil, ErrTxClosed
+	} else if !b.tx.writable {
+		return nil, ErrTxNotWritable
+	} else if len(key) == 0 {
+		return nil, ErrBucketNameRequired
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	k, _, flags := c.seek(key)
+
+	// Return an error if there is an existing key.
+	if bytes.Equal(key, k) {
+		if (flags & bucketLeafFlag) != 0 {
+			return nil, ErrBucketExists
+		} else {
+			return nil, ErrIncompatibleValue
+		}
+	}
+
+	// Create empty, inline bucket.
+	var bucket = Bucket{
+		bucket:      &bucket{},
+		rootNode:    &node{isLeaf: true},
+		FillPercent: DefaultFillPercent,
+	}
+	var value = bucket.write()
+
+	// Insert into node.
+	key = cloneBytes(key)
+	c.node().put(key, key, value, 0, bucketLeafFlag)
+
+	// Since subbuckets are not allowed on inline buckets, we need to
+	// dereference the inline page, if it exists. This will cause the bucket
+	// to be treated as a regular, non-inline bucket for the rest of the tx.
+	b.page = nil
+
+	return b.Bucket(key), nil
+}
+
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
+func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
+	child, err := b.CreateBucket(key)
+	if err == ErrBucketExists {
+		return b.Bucket(key), nil
+	} else if err != nil {
+		return nil, err
+	}
+	return child, nil
+}
+
+// DeleteBucket deletes a bucket at the given key.
+// Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
+func (b *Bucket) DeleteBucket(key []byte) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	k, _, flags := c.seek(key)
+
+	// Return an error if the key doesn't exist or is not a bucket.
+	if !bytes.Equal(key, k) {
+		return ErrBucketNotFound
+	} else if (flags & bucketLeafFlag) == 0 {
+		return ErrIncompatibleValue
+	}
+
+	// Recursively delete all child buckets.
+	child := b.Bucket(key)
+	err := child.ForEach(func(k, v []byte) error {
+		if v == nil {
+			if err := child.DeleteBucket(k); err != nil {
+				return fmt.Errorf("delete bucket: %s", err)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	// Remove cached copy.
+	delete(b.buckets, string(key))
+
+	// Release all bucket pages to freelist.
+	child.nodes = nil
+	child.rootNode = nil
+	child.free()
+
+	// Delete the node if we have a matching key.
+	c.node().del(key)
+
+	return nil
+}
+
+// Get retrieves the value for a key in the bucket.
+// Returns a nil value if the key does not exist or if the key is a nested bucket.
+// The returned value is only valid for the life of the transaction.
+func (b *Bucket) Get(key []byte) []byte {
+	k, v, flags := b.Cursor().seek(key)
+
+	// Return nil if this is a bucket.
+	if (flags & bucketLeafFlag) != 0 {
+		return nil
+	}
+
+	// If our target node isn't the same key as what's passed in then return nil.
+	if !bytes.Equal(key, k) {
+		return nil
+	}
+	return v
+}
+
+// Put sets the value for a key in the bucket.
+// If the key exists then its previous value will be overwritten.
+// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
+func (b *Bucket) Put(key []byte, value []byte) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	} else if len(key) == 0 {
+		return ErrKeyRequired
+	} else if len(key) > MaxKeySize {
+		return ErrKeyTooLarge
+	} else if int64(len(value)) > MaxValueSize {
+		return ErrValueTooLarge
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	k, _, flags := c.seek(key)
+
+	// Return an error if there is an existing key with a bucket value.
+	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
+		return ErrIncompatibleValue
+	}
+
+	// Insert into node.
+	key = cloneBytes(key)
+	c.node().put(key, key, value, 0, 0)
+
+	return nil
+}
+
+// Delete removes a key from the bucket.
+// If the key does not exist then nothing is done and a nil error is returned.
+// Returns an error if the bucket was created from a read-only transaction.
+func (b *Bucket) Delete(key []byte) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	_, _, flags := c.seek(key)
+
+	// Return an error if there is an existing bucket value.
+	if (flags & bucketLeafFlag) != 0 {
+		return ErrIncompatibleValue
+	}
+
+	// Delete the node if we have a matching key.
+	c.node().del(key)
+
+	return nil
+}
+
+// NextSequence returns an autoincrementing integer for the bucket.
+func (b *Bucket) NextSequence() (uint64, error) {
+	if b.tx.db == nil {
+		return 0, ErrTxClosed
+	} else if !b.Writable() {
+		return 0, ErrTxNotWritable
+	}
+
+	// Materialize the root node if it hasn't been already so that the
+	// bucket will be saved during commit.
+	if b.rootNode == nil {
+		_ = b.node(b.root, nil)
+	}
+
+	// Increment and return the sequence.
+	b.bucket.sequence++
+	return b.bucket.sequence, nil
+}
+
+// ForEach executes a function for each key/value pair in a bucket.
+// If the provided function returns an error then the iteration is stopped and
+// the error is returned to the caller.
+func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	}
+	c := b.Cursor()
+	for k, v := c.First(); k != nil; k, v = c.Next() {
+		if err := fn(k, v); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Stats returns stats on a bucket.
+func (b *Bucket) Stats() BucketStats {
+	var s, subStats BucketStats
+	pageSize := b.tx.db.pageSize
+	s.BucketN += 1
+	if b.root == 0 {
+		s.InlineBucketN += 1
+	}
+	b.forEachPage(func(p *page, depth int) {
+		if (p.flags & leafPageFlag) != 0 {
+			s.KeyN += int(p.count)
+
+			// used totals the used bytes for the page
+			used := pageHeaderSize
+
+			if p.count != 0 {
+				// If page has any elements, add all element headers.
+				used += leafPageElementSize * int(p.count-1)
+
+				// Add all element key, value sizes.
+				// The computation takes advantage of the fact that the position
+				// of the last element's key/value equals the total of the sizes
+				// of all previous elements' keys and values.
+				// It also includes the last element's header.
+				lastElement := p.leafPageElement(p.count - 1)
+				used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
+			}
+
+			if b.root == 0 {
+				// For inlined bucket just update the inline stats
+				s.InlineBucketInuse += used
+			} else {
+				// For non-inlined bucket update all the leaf stats
+				s.LeafPageN++
+				s.LeafInuse += used
+				s.LeafOverflowN += int(p.overflow)
+
+				// Collect stats from sub-buckets.
+				// Do that by iterating over all element headers
+				// looking for the ones with the bucketLeafFlag.
+				for i := uint16(0); i < p.count; i++ {
+					e := p.leafPageElement(i)
+					if (e.flags & bucketLeafFlag) != 0 {
+						// For any bucket element, open the element value
+						// and recursively call Stats on the contained bucket.
+						subStats.Add(b.openBucket(e.value()).Stats())
+					}
+				}
+			}
+		} else if (p.flags & branchPageFlag) != 0 {
+			s.BranchPageN++
+			lastElement := p.branchPageElement(p.count - 1)
+
+			// used totals the used bytes for the page
+			// Add header and all element headers.
+			used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
+
+			// Add size of all keys and values.
+			// Again, use the fact that the last element's position equals
+			// the total of key, value sizes of all previous elements.
+			used += int(lastElement.pos + lastElement.ksize)
+			s.BranchInuse += used
+			s.BranchOverflowN += int(p.overflow)
+		}
+
+		// Keep track of maximum page depth.
+		if depth+1 > s.Depth {
+			s.Depth = (depth + 1)
+		}
+	})
+
+	// Alloc stats can be computed from page counts and pageSize.
+	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
+	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize
+
+	// Add the max depth of sub-buckets to get total nested depth.
+	s.Depth += subStats.Depth
+	// Add the stats for all sub-buckets
+	s.Add(subStats)
+	return s
+}
+
+// forEachPage iterates over every page in a bucket, including inline pages.
+func (b *Bucket) forEachPage(fn func(*page, int)) {
+	// If we have an inline page then just use that.
+	if b.page != nil {
+		fn(b.page, 0)
+		return
+	}
+
+	// Otherwise traverse the page hierarchy.
+	b.tx.forEachPage(b.root, 0, fn)
+}
+
+// forEachPageNode iterates over every page (or node) in a bucket.
+// This also includes inline pages.
+func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
+	// If we have an inline page or root node then just use that.
+	if b.page != nil {
+		fn(b.page, nil, 0)
+		return
+	}
+	b._forEachPageNode(b.root, 0, fn)
+}
+
+func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
+	var p, n = b.pageNode(pgid)
+
+	// Execute function.
+	fn(p, n, depth)
+
+	// Recursively loop over children.
+	if p != nil {
+		if (p.flags & branchPageFlag) != 0 {
+			for i := 0; i < int(p.count); i++ {
+				elem := p.branchPageElement(uint16(i))
+				b._forEachPageNode(elem.pgid, depth+1, fn)
+			}
+		}
+	} else {
+		if !n.isLeaf {
+			for _, inode := range n.inodes {
+				b._forEachPageNode(inode.pgid, depth+1, fn)
+			}
+		}
+	}
+}
+
+// spill writes all the nodes for this bucket to dirty pages.
+func (b *Bucket) spill() error {
+	// Spill all child buckets first.
+	for name, child := range b.buckets {
+		// If the child bucket is small enough and it has no child buckets then
+		// write it inline into the parent bucket's page. Otherwise spill it
+		// like a normal bucket and make the parent value a pointer to the page.
+		var value []byte
+		if child.inlineable() {
+			child.free()
+			value = child.write()
+		} else {
+			if err := child.spill(); err != nil {
+				return err
+			}
+
+			// Update the child bucket header in this bucket.
+			value = make([]byte, unsafe.Sizeof(bucket{}))
+			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
+			*bucket = *child.bucket
+		}
+
+		// Skip writing the bucket if there are no materialized nodes.
+		if child.rootNode == nil {
+			continue
+		}
+
+		// Update parent node.
+		var c = b.Cursor()
+		k, _, flags := c.seek([]byte(name))
+		if !bytes.Equal([]byte(name), k) {
+			panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
+		}
+		if flags&bucketLeafFlag == 0 {
+			panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
+		}
+		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
+	}
+
+	// Ignore if there's not a materialized root node.
+	if b.rootNode == nil {
+		return nil
+	}
+
+	// Spill nodes.
+	if err := b.rootNode.spill(); err != nil {
+		return err
+	}
+	b.rootNode = b.rootNode.root()
+
+	// Update the root node for this bucket.
+	if b.rootNode.pgid >= b.tx.meta.pgid {
+		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
+	}
+	b.root = b.rootNode.pgid
+
+	return nil
+}
+
+// inlineable returns true if a bucket is small enough to be written inline
+// and if it contains no subbuckets. Otherwise returns false.
+func (b *Bucket) inlineable() bool {
+	var n = b.rootNode
+
+	// Bucket must only contain a single leaf node.
+	if n == nil || !n.isLeaf {
+		return false
+	}
+
+	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
+	// our threshold for inline bucket size.
+	var size = pageHeaderSize
+	for _, inode := range n.inodes {
+		size += leafPageElementSize + len(inode.key) + len(inode.value)
+
+		if inode.flags&bucketLeafFlag != 0 {
+			return false
+		} else if size > b.maxInlineBucketSize() {
+			return false
+		}
+	}
+
+	return true
+}
+
+// Returns the maximum total size of a bucket to make it a candidate for inlining.
+func (b *Bucket) maxInlineBucketSize() int {
+	return b.tx.db.pageSize / 4
+}
+
+// write allocates and writes a bucket to a byte slice.
+func (b *Bucket) write() []byte {
+	// Allocate the appropriate size.
+	var n = b.rootNode
+	var value = make([]byte, bucketHeaderSize+n.size())
+
+	// Write a bucket header.
+	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
+	*bucket = *b.bucket
+
+	// Convert byte slice to a fake page and write the root node.
+	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+	n.write(p)
+
+	return value
+}
+
+// rebalance attempts to balance all nodes.
+func (b *Bucket) rebalance() {
+	for _, n := range b.nodes {
+		n.rebalance()
+	}
+	for _, child := range b.buckets {
+		child.rebalance()
+	}
+}
+
+// node creates a node from a page and associates it with a given parent.
+func (b *Bucket) node(pgid pgid, parent *node) *node {
+	_assert(b.nodes != nil, "nodes map expected")
+
+	// Retrieve node if it's already been created.
+	if n := b.nodes[pgid]; n != nil {
+		return n
+	}
+
+	// Otherwise create a node and cache it.
+	n := &node{bucket: b, parent: parent}
+	if parent == nil {
+		b.rootNode = n
+	} else {
+		parent.children = append(parent.children, n)
+	}
+
+	// Use the inline page if this is an inline bucket.
+	var p = b.page
+	if p == nil {
+		p = b.tx.page(pgid)
+	}
+
+	// Read the page into the node and cache it.
+	n.read(p)
+	b.nodes[pgid] = n
+
+	// Update statistics.
+	b.tx.stats.NodeCount++
+
+	return n
+}
+
+// free recursively frees all pages in the bucket.
+func (b *Bucket) free() {
+	if b.root == 0 {
+		return
+	}
+
+	var tx = b.tx
+	b.forEachPageNode(func(p *page, n *node, _ int) {
+		if p != nil {
+			tx.db.freelist.free(tx.meta.txid, p)
+		} else {
+			n.free()
+		}
+	})
+	b.root = 0
+}
+
+// dereference removes all references to the old mmap.
+func (b *Bucket) dereference() {
+	if b.rootNode != nil {
+		b.rootNode.root().dereference()
+	}
+
+	for _, child := range b.buckets {
+		child.dereference()
+	}
+}
+
+// pageNode returns the in-memory node, if it exists.
+// Otherwise returns the underlying page.
+func (b *Bucket) pageNode(id pgid) (*page, *node) {
+	// Inline buckets have a fake page embedded in their value so treat them
+	// differently. We'll return the rootNode (if available) or the fake page.
+	if b.root == 0 {
+		if id != 0 {
+			panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
+		}
+		if b.rootNode != nil {
+			return nil, b.rootNode
+		}
+		return b.page, nil
+	}
+
+	// Check the node cache for non-inline buckets.
+	if b.nodes != nil {
+		if n := b.nodes[id]; n != nil {
+			return nil, n
+		}
+	}
+
+	// Finally lookup the page from the transaction if no node is materialized.
+	return b.tx.page(id), nil
+}
+
+// BucketStats records statistics about resources used by a bucket.
+type BucketStats struct {
+	// Page count statistics.
+	BranchPageN     int // number of logical branch pages
+	BranchOverflowN int // number of physical branch overflow pages
+	LeafPageN       int // number of logical leaf pages
+	LeafOverflowN   int // number of physical leaf overflow pages
+
+	// Tree statistics.
+	KeyN  int // number of keys/value pairs
+	Depth int // number of levels in B+tree
+
+	// Page size utilization.
+	BranchAlloc int // bytes allocated for physical branch pages
+	BranchInuse int // bytes actually used for branch data
+	LeafAlloc   int // bytes allocated for physical leaf pages
+	LeafInuse   int // bytes actually used for leaf data
+
+	// Bucket statistics
+	BucketN           int // total number of buckets including the top bucket
+	InlineBucketN     int // total number of inlined buckets
+	InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
+}
+
+func (s *BucketStats) Add(other BucketStats) {
+	s.BranchPageN += other.BranchPageN
+	s.BranchOverflowN += other.BranchOverflowN
+	s.LeafPageN += other.LeafPageN
+	s.LeafOverflowN += other.LeafOverflowN
+	s.KeyN += other.KeyN
+	if s.Depth < other.Depth {
+		s.Depth = other.Depth
+	}
+	s.BranchAlloc += other.BranchAlloc
+	s.BranchInuse += other.BranchInuse
+	s.LeafAlloc += other.LeafAlloc
+	s.LeafInuse += other.LeafInuse
+
+	s.BucketN += other.BucketN
+	s.InlineBucketN += other.InlineBucketN
+	s.InlineBucketInuse += other.InlineBucketInuse
+}
+
+// cloneBytes returns a copy of a given slice.
+func cloneBytes(v []byte) []byte {
+	var clone = make([]byte, len(v))
+	copy(clone, v)
+	return clone
+}

+ 1153 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go

@@ -0,0 +1,1153 @@
+package bolt_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"math/rand"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"testing/quick"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+// Ensure that a bucket that gets a non-existent key returns nil.
+func TestBucket_Get_NonExistent(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		assert(t, value == nil, "")
+		return nil
+	})
+}
+
+// Ensure that a bucket can read a value that is not flushed yet.
+func TestBucket_Get_FromNode(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		b.Put([]byte("foo"), []byte("bar"))
+		value := b.Get([]byte("foo"))
+		equals(t, []byte("bar"), value)
+		return nil
+	})
+}
+
+// Ensure that getting a key that holds a nested bucket via Get() returns nil.
+func TestBucket_Get_IncompatibleValue(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		_, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo"))
+		ok(t, err)
+		assert(t, tx.Bucket([]byte("widgets")).Get([]byte("foo")) == nil, "")
+		return nil
+	})
+}
+
+// Ensure that a bucket can write a key/value.
+func TestBucket_Put(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		err := tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		ok(t, err)
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		equals(t, value, []byte("bar"))
+		return nil
+	})
+}
+
+// Ensure that a bucket can rewrite a key in the same transaction.
+func TestBucket_Put_Repeat(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		ok(t, b.Put([]byte("foo"), []byte("bar")))
+		ok(t, b.Put([]byte("foo"), []byte("baz")))
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		equals(t, value, []byte("baz"))
+		return nil
+	})
+}
+
+// Ensure that a bucket can write a bunch of large values.
+func TestBucket_Put_Large(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	count, factor := 100, 200
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		for i := 1; i < count; i++ {
+			ok(t, b.Put([]byte(strings.Repeat("0", i*factor)), []byte(strings.Repeat("X", (count-i)*factor))))
+		}
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		for i := 1; i < count; i++ {
+			value := b.Get([]byte(strings.Repeat("0", i*factor)))
+			equals(t, []byte(strings.Repeat("X", (count-i)*factor)), value)
+		}
+		return nil
+	})
+}
+
+// Ensure that a database can perform multiple large appends safely.
+func TestDB_Put_VeryLarge(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	n, batchN := 400000, 200000
+	ksize, vsize := 8, 500
+
+	db := NewTestDB()
+	defer db.Close()
+
+	for i := 0; i < n; i += batchN {
+		err := db.Update(func(tx *bolt.Tx) error {
+			b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
+			for j := 0; j < batchN; j++ {
+				k, v := make([]byte, ksize), make([]byte, vsize)
+				binary.BigEndian.PutUint32(k, uint32(i+j))
+				ok(t, b.Put(k, v))
+			}
+			return nil
+		})
+		ok(t, err)
+	}
+}
+
+// Ensure that setting a value on a key with a bucket value returns an error.
+func TestBucket_Put_IncompatibleValue(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		_, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo"))
+		ok(t, err)
+		equals(t, bolt.ErrIncompatibleValue, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")))
+		return nil
+	})
+}
+
+// Ensure that setting a value while the transaction is closed returns an error.
+func TestBucket_Put_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.CreateBucket([]byte("widgets"))
+	b := tx.Bucket([]byte("widgets"))
+	tx.Rollback()
+	equals(t, bolt.ErrTxClosed, b.Put([]byte("foo"), []byte("bar")))
+}
+
+// Ensure that setting a value on a read-only bucket returns an error.
+func TestBucket_Put_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		err := b.Put([]byte("foo"), []byte("bar"))
+		equals(t, err, bolt.ErrTxNotWritable)
+		return nil
+	})
+}
+
+// Ensure that a bucket can delete an existing key.
+func TestBucket_Delete(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		err := tx.Bucket([]byte("widgets")).Delete([]byte("foo"))
+		ok(t, err)
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		assert(t, value == nil, "")
+		return nil
+	})
+}
+
+// Ensure that deleting a large set of keys will work correctly.
+func TestBucket_Delete_Large(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		var b, _ = tx.CreateBucket([]byte("widgets"))
+		for i := 0; i < 100; i++ {
+			ok(t, b.Put([]byte(strconv.Itoa(i)), []byte(strings.Repeat("*", 1024))))
+		}
+		return nil
+	})
+	db.Update(func(tx *bolt.Tx) error {
+		var b = tx.Bucket([]byte("widgets"))
+		for i := 0; i < 100; i++ {
+			ok(t, b.Delete([]byte(strconv.Itoa(i))))
+		}
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		var b = tx.Bucket([]byte("widgets"))
+		for i := 0; i < 100; i++ {
+			assert(t, b.Get([]byte(strconv.Itoa(i))) == nil, "")
+		}
+		return nil
+	})
+}
+
+// Deleting a very large list of keys will cause the freelist to use overflow.
+func TestBucket_Delete_FreelistOverflow(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	db := NewTestDB()
+	defer db.Close()
+	k := make([]byte, 16)
+	for i := uint64(0); i < 10000; i++ {
+		err := db.Update(func(tx *bolt.Tx) error {
+			b, err := tx.CreateBucketIfNotExists([]byte("0"))
+			if err != nil {
+				t.Fatalf("bucket error: %s", err)
+			}
+
+			for j := uint64(0); j < 1000; j++ {
+				binary.BigEndian.PutUint64(k[:8], i)
+				binary.BigEndian.PutUint64(k[8:], j)
+				if err := b.Put(k, nil); err != nil {
+					t.Fatalf("put error: %s", err)
+				}
+			}
+
+			return nil
+		})
+
+		if err != nil {
+			t.Fatalf("update error: %s", err)
+		}
+	}
+
+	// Delete all of them in one large transaction
+	err := db.Update(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("0"))
+		c := b.Cursor()
+		for k, _ := c.First(); k != nil; k, _ = c.Next() {
+			b.Delete(k)
+		}
+		return nil
+	})
+
+	// Check that a freelist overflow occurred.
+	ok(t, err)
+}
+
+// Ensure that accessing and updating nested buckets is ok across transactions.
+func TestBucket_Nested(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a widgets bucket.
+		b, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+
+		// Create a widgets/foo bucket.
+		_, err = b.CreateBucket([]byte("foo"))
+		ok(t, err)
+
+		// Create a widgets/bar key.
+		ok(t, b.Put([]byte("bar"), []byte("0000")))
+
+		return nil
+	})
+	db.MustCheck()
+
+	// Update widgets/bar.
+	db.Update(func(tx *bolt.Tx) error {
+		var b = tx.Bucket([]byte("widgets"))
+		ok(t, b.Put([]byte("bar"), []byte("xxxx")))
+		return nil
+	})
+	db.MustCheck()
+
+	// Cause a split.
+	db.Update(func(tx *bolt.Tx) error {
+		var b = tx.Bucket([]byte("widgets"))
+		for i := 0; i < 10000; i++ {
+			ok(t, b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))))
+		}
+		return nil
+	})
+	db.MustCheck()
+
+	// Insert into widgets/foo/baz.
+	db.Update(func(tx *bolt.Tx) error {
+		var b = tx.Bucket([]byte("widgets"))
+		ok(t, b.Bucket([]byte("foo")).Put([]byte("baz"), []byte("yyyy")))
+		return nil
+	})
+	db.MustCheck()
+
+	// Verify.
+	db.View(func(tx *bolt.Tx) error {
+		var b = tx.Bucket([]byte("widgets"))
+		equals(t, []byte("yyyy"), b.Bucket([]byte("foo")).Get([]byte("baz")))
+		equals(t, []byte("xxxx"), b.Get([]byte("bar")))
+		for i := 0; i < 10000; i++ {
+			equals(t, []byte(strconv.Itoa(i)), b.Get([]byte(strconv.Itoa(i))))
+		}
+		return nil
+	})
+}
+
+// Ensure that deleting a bucket using Delete() returns an error.
+func TestBucket_Delete_Bucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		_, err := b.CreateBucket([]byte("foo"))
+		ok(t, err)
+		equals(t, bolt.ErrIncompatibleValue, b.Delete([]byte("foo")))
+		return nil
+	})
+}
+
+// Ensure that deleting a key on a read-only bucket returns an error.
+func TestBucket_Delete_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		err := b.Delete([]byte("foo"))
+		equals(t, err, bolt.ErrTxNotWritable)
+		return nil
+	})
+}
+
+// Ensure that deleting a value while the transaction is closed returns an error.
+func TestBucket_Delete_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.CreateBucket([]byte("widgets"))
+	b := tx.Bucket([]byte("widgets"))
+	tx.Rollback()
+	equals(t, bolt.ErrTxClosed, b.Delete([]byte("foo")))
+}
+
+// Ensure that deleting a bucket causes nested buckets to be deleted.
+func TestBucket_DeleteBucket_Nested(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		_, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo"))
+		ok(t, err)
+		_, err = tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).CreateBucket([]byte("bar"))
+		ok(t, err)
+		ok(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")).Put([]byte("baz"), []byte("bat")))
+		ok(t, tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo")))
+		return nil
+	})
+}
+
+// Ensure that deleting a bucket causes nested buckets to be deleted after they have been committed.
+func TestBucket_DeleteBucket_Nested2(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		_, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo"))
+		ok(t, err)
+		_, err = tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).CreateBucket([]byte("bar"))
+		ok(t, err)
+		ok(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")).Put([]byte("baz"), []byte("bat")))
+		return nil
+	})
+	db.Update(func(tx *bolt.Tx) error {
+		assert(t, tx.Bucket([]byte("widgets")) != nil, "")
+		assert(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")) != nil, "")
+		assert(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")) != nil, "")
+		equals(t, []byte("bat"), tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")).Get([]byte("baz")))
+		ok(t, tx.DeleteBucket([]byte("widgets")))
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		assert(t, tx.Bucket([]byte("widgets")) == nil, "")
+		return nil
+	})
+}
+
+// Ensure that deleting a child bucket with multiple pages causes all pages to get collected.
+func TestBucket_DeleteBucket_Large(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		_, err = tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo"))
+		ok(t, err)
+		b := tx.Bucket([]byte("widgets")).Bucket([]byte("foo"))
+		for i := 0; i < 1000; i++ {
+			ok(t, b.Put([]byte(fmt.Sprintf("%d", i)), []byte(fmt.Sprintf("%0100d", i))))
+		}
+		return nil
+	})
+	db.Update(func(tx *bolt.Tx) error {
+		ok(t, tx.DeleteBucket([]byte("widgets")))
+		return nil
+	})
+
+	// NOTE: Consistency check in TestDB.Close() will panic if pages are not freed properly.
+}
+
+// Ensure that a simple value retrieved via Bucket() returns nil.
+func TestBucket_Bucket_IncompatibleValue(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")))
+		assert(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")) == nil, "")
+		return nil
+	})
+}
+
+// Ensure that creating a bucket on an existing non-bucket key returns an error.
+func TestBucket_CreateBucket_IncompatibleValue(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")))
+		_, err = tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo"))
+		equals(t, bolt.ErrIncompatibleValue, err)
+		return nil
+	})
+}
+
+// Ensure that deleting a bucket on an existing non-bucket key returns an error.
+func TestBucket_DeleteBucket_IncompatibleValue(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")))
+		equals(t, bolt.ErrIncompatibleValue, tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo")))
+		return nil
+	})
+}
+
+// Ensure that a bucket can return an autoincrementing sequence.
+func TestBucket_NextSequence(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.CreateBucket([]byte("woojits"))
+
+		// Make sure sequence increments.
+		seq, err := tx.Bucket([]byte("widgets")).NextSequence()
+		ok(t, err)
+		equals(t, seq, uint64(1))
+		seq, err = tx.Bucket([]byte("widgets")).NextSequence()
+		ok(t, err)
+		equals(t, seq, uint64(2))
+
+		// Buckets should be separate.
+		seq, err = tx.Bucket([]byte("woojits")).NextSequence()
+		ok(t, err)
+		equals(t, seq, uint64(1))
+		return nil
+	})
+}
+
+// Ensure that a bucket will persist an autoincrementing sequence even if it's
+// the only thing updated on the bucket.
+// https://github.com/boltdb/bolt/issues/296
+func TestBucket_NextSequence_Persist(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, _ = tx.CreateBucket([]byte("widgets"))
+		return nil
+	})
+
+	db.Update(func(tx *bolt.Tx) error {
+		_, _ = tx.Bucket([]byte("widgets")).NextSequence()
+		return nil
+	})
+
+	db.Update(func(tx *bolt.Tx) error {
+		seq, err := tx.Bucket([]byte("widgets")).NextSequence()
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		} else if seq != 2 {
+			t.Fatalf("unexpected sequence: %d", seq)
+		}
+		return nil
+	})
+}
+
+// Ensure that retrieving the next sequence on a read-only bucket returns an error.
+func TestBucket_NextSequence_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		i, err := b.NextSequence()
+		equals(t, i, uint64(0))
+		equals(t, err, bolt.ErrTxNotWritable)
+		return nil
+	})
+}
+
+// Ensure that retrieving the next sequence for a bucket on a closed database returns an error.
+func TestBucket_NextSequence_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.CreateBucket([]byte("widgets"))
+	b := tx.Bucket([]byte("widgets"))
+	tx.Rollback()
+	_, err := b.NextSequence()
+	equals(t, bolt.ErrTxClosed, err)
+}
+
+// Ensure a user can loop over all key/value pairs in a bucket.
+func TestBucket_ForEach(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("0000"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("0001"))
+		tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte("0002"))
+
+		var index int
+		err := tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error {
+			switch index {
+			case 0:
+				equals(t, k, []byte("bar"))
+				equals(t, v, []byte("0002"))
+			case 1:
+				equals(t, k, []byte("baz"))
+				equals(t, v, []byte("0001"))
+			case 2:
+				equals(t, k, []byte("foo"))
+				equals(t, v, []byte("0000"))
+			}
+			index++
+			return nil
+		})
+		ok(t, err)
+		equals(t, index, 3)
+		return nil
+	})
+}
+
+// Ensure a database can stop iteration early.
+func TestBucket_ForEach_ShortCircuit(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte("0000"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("0000"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("0000"))
+
+		var index int
+		err := tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error {
+			index++
+			if bytes.Equal(k, []byte("baz")) {
+				return errors.New("marker")
+			}
+			return nil
+		})
+		equals(t, errors.New("marker"), err)
+		equals(t, 2, index)
+		return nil
+	})
+}
+
+// Ensure that looping over a bucket on a closed database returns an error.
+func TestBucket_ForEach_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.CreateBucket([]byte("widgets"))
+	b := tx.Bucket([]byte("widgets"))
+	tx.Rollback()
+	err := b.ForEach(func(k, v []byte) error { return nil })
+	equals(t, bolt.ErrTxClosed, err)
+}
+
+// Ensure that an error is returned when inserting with an empty key.
+func TestBucket_Put_EmptyKey(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		err := tx.Bucket([]byte("widgets")).Put([]byte(""), []byte("bar"))
+		equals(t, err, bolt.ErrKeyRequired)
+		err = tx.Bucket([]byte("widgets")).Put(nil, []byte("bar"))
+		equals(t, err, bolt.ErrKeyRequired)
+		return nil
+	})
+}
+
+// Ensure that an error is returned when inserting with a key that's too large.
+func TestBucket_Put_KeyTooLarge(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		err := tx.Bucket([]byte("widgets")).Put(make([]byte, 32769), []byte("bar"))
+		equals(t, err, bolt.ErrKeyTooLarge)
+		return nil
+	})
+}
+
+// Ensure a bucket can calculate stats.
+func TestBucket_Stats(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	// Add a bucket with 500 small keys and one big value.
+	big_key := []byte("really-big-value")
+	for i := 0; i < 500; i++ {
+		db.Update(func(tx *bolt.Tx) error {
+			b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+			return b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
+		})
+	}
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+		return b.Put(big_key, []byte(strings.Repeat("*", 10000)))
+	})
+
+	db.MustCheck()
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("woojits"))
+		stats := b.Stats()
+		equals(t, 1, stats.BranchPageN)
+		equals(t, 0, stats.BranchOverflowN)
+		equals(t, 7, stats.LeafPageN)
+		equals(t, 2, stats.LeafOverflowN)
+		equals(t, 501, stats.KeyN)
+		equals(t, 2, stats.Depth)
+
+		branchInuse := 16     // branch page header
+		branchInuse += 7 * 16 // branch elements
+		branchInuse += 7 * 3  // branch keys (7 3-byte keys)
+		equals(t, branchInuse, stats.BranchInuse)
+
+		leafInuse := 7 * 16                      // leaf page header
+		leafInuse += 501 * 16                    // leaf elements
+		leafInuse += 500*3 + len(big_key)        // leaf keys
+		leafInuse += 1*10 + 2*90 + 3*400 + 10000 // leaf values
+		equals(t, leafInuse, stats.LeafInuse)
+
+		if os.Getpagesize() == 4096 {
+			// Alloc sizes depend on the page size; only check them for 4096-byte pages.
+			equals(t, 4096, stats.BranchAlloc)
+			equals(t, 36864, stats.LeafAlloc)
+		}
+
+		equals(t, 1, stats.BucketN)
+		equals(t, 0, stats.InlineBucketN)
+		equals(t, 0, stats.InlineBucketInuse)
+		return nil
+	})
+}
+
+// Ensure a bucket with random insertion utilizes fill percentage correctly.
+func TestBucket_Stats_RandomFill(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	} else if os.Getpagesize() != 4096 {
+		t.Skip("invalid page size for test")
+	}
+
+	db := NewTestDB()
+	defer db.Close()
+
+	// Add a set of values in random order. It will be the same random
+	// order so we can maintain consistency between test runs.
+	var count int
+	r := rand.New(rand.NewSource(42))
+	for _, i := range r.Perm(1000) {
+		db.Update(func(tx *bolt.Tx) error {
+			b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+			b.FillPercent = 0.9
+			for _, j := range r.Perm(100) {
+				index := (j * 10000) + i
+				b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000"))
+				count++
+			}
+			return nil
+		})
+	}
+	db.MustCheck()
+
+	db.View(func(tx *bolt.Tx) error {
+		s := tx.Bucket([]byte("woojits")).Stats()
+		equals(t, 100000, s.KeyN)
+
+		equals(t, 98, s.BranchPageN)
+		equals(t, 0, s.BranchOverflowN)
+		equals(t, 130984, s.BranchInuse)
+		equals(t, 401408, s.BranchAlloc)
+
+		equals(t, 3412, s.LeafPageN)
+		equals(t, 0, s.LeafOverflowN)
+		equals(t, 4742482, s.LeafInuse)
+		equals(t, 13975552, s.LeafAlloc)
+		return nil
+	})
+}
+
+// Ensure a bucket that fits on a single root leaf can calculate stats.
+func TestBucket_Stats_Small(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		// Add a bucket that fits on a single root leaf.
+		b, err := tx.CreateBucket([]byte("whozawhats"))
+		ok(t, err)
+		b.Put([]byte("foo"), []byte("bar"))
+
+		return nil
+	})
+	db.MustCheck()
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("whozawhats"))
+		stats := b.Stats()
+		equals(t, 0, stats.BranchPageN)
+		equals(t, 0, stats.BranchOverflowN)
+		equals(t, 0, stats.LeafPageN)
+		equals(t, 0, stats.LeafOverflowN)
+		equals(t, 1, stats.KeyN)
+		equals(t, 1, stats.Depth)
+		equals(t, 0, stats.BranchInuse)
+		equals(t, 0, stats.LeafInuse)
+		if os.Getpagesize() == 4096 {
+			// Alloc sizes depend on the page size; only check them for 4096-byte pages.
+			equals(t, 0, stats.BranchAlloc)
+			equals(t, 0, stats.LeafAlloc)
+		}
+		equals(t, 1, stats.BucketN)
+		equals(t, 1, stats.InlineBucketN)
+		equals(t, 16+16+6, stats.InlineBucketInuse)
+		return nil
+	})
+}
+
+func TestBucket_Stats_EmptyBucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		// Add a bucket that fits on a single root leaf.
+		_, err := tx.CreateBucket([]byte("whozawhats"))
+		ok(t, err)
+		return nil
+	})
+	db.MustCheck()
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("whozawhats"))
+		stats := b.Stats()
+		equals(t, 0, stats.BranchPageN)
+		equals(t, 0, stats.BranchOverflowN)
+		equals(t, 0, stats.LeafPageN)
+		equals(t, 0, stats.LeafOverflowN)
+		equals(t, 0, stats.KeyN)
+		equals(t, 1, stats.Depth)
+		equals(t, 0, stats.BranchInuse)
+		equals(t, 0, stats.LeafInuse)
+		if os.Getpagesize() == 4096 {
+			// Alloc sizes depend on the page size; only check them for 4096-byte pages.
+			equals(t, 0, stats.BranchAlloc)
+			equals(t, 0, stats.LeafAlloc)
+		}
+		equals(t, 1, stats.BucketN)
+		equals(t, 1, stats.InlineBucketN)
+		equals(t, 16, stats.InlineBucketInuse)
+		return nil
+	})
+}
+
+// Ensure a bucket with nested buckets can calculate stats.
+func TestBucket_Stats_Nested(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("foo"))
+		ok(t, err)
+		for i := 0; i < 100; i++ {
+			b.Put([]byte(fmt.Sprintf("%02d", i)), []byte(fmt.Sprintf("%02d", i)))
+		}
+		bar, err := b.CreateBucket([]byte("bar"))
+		ok(t, err)
+		for i := 0; i < 10; i++ {
+			bar.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i)))
+		}
+		baz, err := bar.CreateBucket([]byte("baz"))
+		ok(t, err)
+		for i := 0; i < 10; i++ {
+			baz.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i)))
+		}
+		return nil
+	})
+
+	db.MustCheck()
+
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("foo"))
+		stats := b.Stats()
+		equals(t, 0, stats.BranchPageN)
+		equals(t, 0, stats.BranchOverflowN)
+		equals(t, 2, stats.LeafPageN)
+		equals(t, 0, stats.LeafOverflowN)
+		equals(t, 122, stats.KeyN)
+		equals(t, 3, stats.Depth)
+		equals(t, 0, stats.BranchInuse)
+
+		foo := 16            // foo (pghdr)
+		foo += 101 * 16      // foo leaf elements
+		foo += 100*2 + 100*2 // foo leaf key/values
+		foo += 3 + 16        // foo -> bar key/value
+
+		bar := 16      // bar (pghdr)
+		bar += 11 * 16 // bar leaf elements
+		bar += 10 + 10 // bar leaf key/values
+		bar += 3 + 16  // bar -> baz key/value
+
+		baz := 16      // baz (inline) (pghdr)
+		baz += 10 * 16 // baz leaf elements
+		baz += 10 + 10 // baz leaf key/values
+
+		equals(t, foo+bar+baz, stats.LeafInuse)
+		if os.Getpagesize() == 4096 {
+			// Alloc sizes depend on the page size; only check them for 4096-byte pages.
+			equals(t, 0, stats.BranchAlloc)
+			equals(t, 8192, stats.LeafAlloc)
+		}
+		equals(t, 3, stats.BucketN)
+		equals(t, 1, stats.InlineBucketN)
+		equals(t, baz, stats.InlineBucketInuse)
+		return nil
+	})
+}
+
+// Ensure a large bucket can calculate stats.
+func TestBucket_Stats_Large(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	db := NewTestDB()
+	defer db.Close()
+
+	var index int
+	for i := 0; i < 100; i++ {
+		db.Update(func(tx *bolt.Tx) error {
+			// Add bucket with lots of keys.
+			b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
+			for i := 0; i < 1000; i++ {
+				b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index)))
+				index++
+			}
+			return nil
+		})
+	}
+	db.MustCheck()
+
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		stats := b.Stats()
+		equals(t, 13, stats.BranchPageN)
+		equals(t, 0, stats.BranchOverflowN)
+		equals(t, 1196, stats.LeafPageN)
+		equals(t, 0, stats.LeafOverflowN)
+		equals(t, 100000, stats.KeyN)
+		equals(t, 3, stats.Depth)
+		equals(t, 25257, stats.BranchInuse)
+		equals(t, 2596916, stats.LeafInuse)
+		if os.Getpagesize() == 4096 {
+			// Alloc sizes depend on the page size; only check them for 4096-byte pages.
+			equals(t, 53248, stats.BranchAlloc)
+			equals(t, 4898816, stats.LeafAlloc)
+		}
+		equals(t, 1, stats.BucketN)
+		equals(t, 0, stats.InlineBucketN)
+		equals(t, 0, stats.InlineBucketInuse)
+		return nil
+	})
+}
+
+// Ensure that a bucket can write random keys and values across multiple transactions.
+func TestBucket_Put_Single(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	index := 0
+	f := func(items testdata) bool {
+		db := NewTestDB()
+		defer db.Close()
+
+		m := make(map[string][]byte)
+
+		db.Update(func(tx *bolt.Tx) error {
+			_, err := tx.CreateBucket([]byte("widgets"))
+			return err
+		})
+		for _, item := range items {
+			db.Update(func(tx *bolt.Tx) error {
+				if err := tx.Bucket([]byte("widgets")).Put(item.Key, item.Value); err != nil {
+					panic("put error: " + err.Error())
+				}
+				m[string(item.Key)] = item.Value
+				return nil
+			})
+
+			// Verify all key/values so far.
+			db.View(func(tx *bolt.Tx) error {
+				i := 0
+				for k, v := range m {
+					value := tx.Bucket([]byte("widgets")).Get([]byte(k))
+					if !bytes.Equal(value, v) {
+						t.Logf("value mismatch [run %d] (%d of %d):\nkey: %x\ngot: %x\nexp: %x", index, i, len(m), []byte(k), value, v)
+						db.CopyTempFile()
+						t.FailNow()
+					}
+					i++
+				}
+				return nil
+			})
+		}
+
+		index++
+		return true
+	}
+	if err := quick.Check(f, qconfig()); err != nil {
+		t.Error(err)
+	}
+}
+
+// Ensure that a transaction can insert multiple key/value pairs at once.
+func TestBucket_Put_Multiple(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	f := func(items testdata) bool {
+		db := NewTestDB()
+		defer db.Close()
+		// Bulk insert all values.
+		db.Update(func(tx *bolt.Tx) error {
+			_, err := tx.CreateBucket([]byte("widgets"))
+			return err
+		})
+		err := db.Update(func(tx *bolt.Tx) error {
+			b := tx.Bucket([]byte("widgets"))
+			for _, item := range items {
+				ok(t, b.Put(item.Key, item.Value))
+			}
+			return nil
+		})
+		ok(t, err)
+
+		// Verify all items exist.
+		db.View(func(tx *bolt.Tx) error {
+			b := tx.Bucket([]byte("widgets"))
+			for _, item := range items {
+				value := b.Get(item.Key)
+				if !bytes.Equal(item.Value, value) {
+					db.CopyTempFile()
+					t.Fatalf("exp=%x; got=%x", item.Value, value)
+				}
+			}
+			return nil
+		})
+		return true
+	}
+	if err := quick.Check(f, qconfig()); err != nil {
+		t.Error(err)
+	}
+}
+
+// Ensure that a transaction can delete all key/value pairs and return to a single leaf page.
+func TestBucket_Delete_Quick(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	f := func(items testdata) bool {
+		db := NewTestDB()
+		defer db.Close()
+		// Bulk insert all values.
+		db.Update(func(tx *bolt.Tx) error {
+			_, err := tx.CreateBucket([]byte("widgets"))
+			return err
+		})
+		err := db.Update(func(tx *bolt.Tx) error {
+			b := tx.Bucket([]byte("widgets"))
+			for _, item := range items {
+				ok(t, b.Put(item.Key, item.Value))
+			}
+			return nil
+		})
+		ok(t, err)
+
+		// Remove items one at a time and check consistency.
+		for _, item := range items {
+			err := db.Update(func(tx *bolt.Tx) error {
+				return tx.Bucket([]byte("widgets")).Delete(item.Key)
+			})
+			ok(t, err)
+		}
+
+		// The bucket should now be empty.
+		db.View(func(tx *bolt.Tx) error {
+			tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error {
+				t.Fatalf("bucket should be empty; found: %06x", trunc(k, 3))
+				return nil
+			})
+			return nil
+		})
+		return true
+	}
+	if err := quick.Check(f, qconfig()); err != nil {
+		t.Error(err)
+	}
+}
+
+func ExampleBucket_Put() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Start a write transaction.
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a bucket.
+		tx.CreateBucket([]byte("widgets"))
+
+		// Set the value "bar" for the key "foo".
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		return nil
+	})
+
+	// Read value back in a different read-only transaction.
+	db.View(func(tx *bolt.Tx) error {
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		fmt.Printf("The value of 'foo' is: %s\n", value)
+		return nil
+	})
+
+	// Output:
+	// The value of 'foo' is: bar
+}
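+
+// ExampleBucket_CreateBucketIfNotExists is a minimal illustrative sketch and is
+// not part of the upstream bolt sources; it only reuses APIs exercised in the
+// tests above to show that CreateBucketIfNotExists returns an existing nested
+// bucket instead of failing with ErrBucketExists.
+func ExampleBucket_CreateBucketIfNotExists() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a top-level bucket and a nested bucket inside it.
+		root, _ := tx.CreateBucket([]byte("widgets"))
+		nested, _ := root.CreateBucketIfNotExists([]byte("foo"))
+		nested.Put([]byte("answer"), []byte("42"))
+
+		// Asking for the same nested bucket again returns it rather than an error.
+		again, _ := root.CreateBucketIfNotExists([]byte("foo"))
+		fmt.Printf("The nested value of 'answer' is: %s\n", again.Get([]byte("answer")))
+		return nil
+	})
+
+	// Output:
+	// The nested value of 'answer' is: 42
+}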
+
+func ExampleBucket_Delete() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Start a write transaction.
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a bucket.
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+
+		// Set the value "bar" for the key "foo".
+		b.Put([]byte("foo"), []byte("bar"))
+
+		// Retrieve the key back from the database and verify it.
+		value := b.Get([]byte("foo"))
+		fmt.Printf("The value of 'foo' was: %s\n", value)
+		return nil
+	})
+
+	// Delete the key in a different write transaction.
+	db.Update(func(tx *bolt.Tx) error {
+		return tx.Bucket([]byte("widgets")).Delete([]byte("foo"))
+	})
+
+	// Retrieve the key again.
+	db.View(func(tx *bolt.Tx) error {
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		if value == nil {
+			fmt.Printf("The value of 'foo' is now: nil\n")
+		}
+		return nil
+	})
+
+	// Output:
+	// The value of 'foo' was: bar
+	// The value of 'foo' is now: nil
+}
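+
+// ExampleBucket_DeleteBucket is a minimal illustrative sketch and is not part
+// of the upstream bolt sources; it only combines CreateBucket, Bucket and
+// DeleteBucket as exercised by the nested-bucket tests above.
+func ExampleBucket_DeleteBucket() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Create a widgets bucket with a nested foo bucket inside it.
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		_, err := b.CreateBucket([]byte("foo"))
+		return err
+	})
+
+	// Delete the nested bucket and confirm that it is gone.
+	db.Update(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		if err := b.DeleteBucket([]byte("foo")); err != nil {
+			return err
+		}
+		fmt.Printf("The nested bucket 'foo' still exists: %v\n", b.Bucket([]byte("foo")) != nil)
+		return nil
+	})
+
+	// Output:
+	// The nested bucket 'foo' still exists: false
+}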
+
+func ExampleBucket_ForEach() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Insert data into a bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("animals"))
+		b := tx.Bucket([]byte("animals"))
+		b.Put([]byte("dog"), []byte("fun"))
+		b.Put([]byte("cat"), []byte("lame"))
+		b.Put([]byte("liger"), []byte("awesome"))
+
+		// Iterate over items in sorted key order.
+		b.ForEach(func(k, v []byte) error {
+			fmt.Printf("A %s is %s.\n", k, v)
+			return nil
+		})
+		return nil
+	})
+
+	// Output:
+	// A cat is lame.
+	// A dog is fun.
+	// A liger is awesome.
+}
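+
+// ExampleBucket_NextSequence is a minimal illustrative sketch and is not part
+// of the upstream bolt sources; it only restates the behaviour asserted by
+// TestBucket_NextSequence above: the per-bucket counter starts at 1 and
+// increments on each call within a writable transaction.
+func ExampleBucket_NextSequence() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a bucket and pull two sequence values from it.
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		seq1, _ := b.NextSequence()
+		seq2, _ := b.NextSequence()
+		fmt.Printf("The next sequences are: %d, %d\n", seq1, seq2)
+		return nil
+	})
+
+	// Output:
+	// The next sequences are: 1, 2
+}
+
+// ExampleBucket_Stats is a minimal illustrative sketch and is not part of the
+// upstream bolt sources; it prints only the page-size-independent counters
+// that TestBucket_Stats_Small above asserts directly.
+func ExampleBucket_Stats() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Write a single key so the bucket stays small enough to remain inline.
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("whozawhats"))
+		return b.Put([]byte("foo"), []byte("bar"))
+	})
+
+	db.View(func(tx *bolt.Tx) error {
+		stats := tx.Bucket([]byte("whozawhats")).Stats()
+		fmt.Printf("keys=%d buckets=%d inline=%d\n", stats.KeyN, stats.BucketN, stats.InlineBucketN)
+		return nil
+	})
+
+	// Output:
+	// keys=1 buckets=1 inline=1
+}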

BIN
Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/bolt


+ 1529 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main.go

@@ -0,0 +1,1529 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"runtime"
+	"runtime/pprof"
+	"strconv"
+	"strings"
+	"time"
+	"unicode"
+	"unicode/utf8"
+	"unsafe"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+var (
+	// ErrUsage is returned when a usage message was printed and the process
+	// should simply exit with an error.
+	ErrUsage = errors.New("usage")
+
+	// ErrUnknownCommand is returned when a CLI command is not recognized.
+	ErrUnknownCommand = errors.New("unknown command")
+
+	// ErrPathRequired is returned when the path to a Bolt database is not specified.
+	ErrPathRequired = errors.New("path required")
+
+	// ErrFileNotFound is returned when a Bolt database does not exist.
+	ErrFileNotFound = errors.New("file not found")
+
+	// ErrInvalidValue is returned when a benchmark reads an unexpected value.
+	ErrInvalidValue = errors.New("invalid value")
+
+	// ErrCorrupt is returned when checking a data file finds errors.
+	ErrCorrupt = errors.New("invalid value")
+
+	// ErrNonDivisibleBatchSize is returned when the batch size can't be evenly
+	// divided by the iteration count.
+	ErrNonDivisibleBatchSize = errors.New("number of iterations must be divisible by the batch size")
+
+	// ErrPageIDRequired is returned when a required page id is not specified.
+	ErrPageIDRequired = errors.New("page id required")
+
+	// ErrPageNotFound is returned when specifying a page above the high water mark.
+	ErrPageNotFound = errors.New("page not found")
+
+	// ErrPageFreed is returned when reading a page that has already been freed.
+	ErrPageFreed = errors.New("page freed")
+)
+
+// PageHeaderSize represents the size of the bolt.page header.
+const PageHeaderSize = 16
+
+func main() {
+	m := NewMain()
+	if err := m.Run(os.Args[1:]...); err == ErrUsage {
+		os.Exit(2)
+	} else if err != nil {
+		fmt.Println(err.Error())
+		os.Exit(1)
+	}
+}
+
+// Main represents the main program execution.
+type Main struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// NewMain returns a new instance of Main connected to the standard input/output.
+func NewMain() *Main {
+	return &Main{
+		Stdin:  os.Stdin,
+		Stdout: os.Stdout,
+		Stderr: os.Stderr,
+	}
+}
+
+// Run executes the program.
+func (m *Main) Run(args ...string) error {
+	// Require a command at the beginning.
+	if len(args) == 0 || strings.HasPrefix(args[0], "-") {
+		fmt.Fprintln(m.Stderr, m.Usage())
+		return ErrUsage
+	}
+
+	// Execute command.
+	switch args[0] {
+	case "help":
+		fmt.Fprintln(m.Stderr, m.Usage())
+		return ErrUsage
+	case "bench":
+		return newBenchCommand(m).Run(args[1:]...)
+	case "check":
+		return newCheckCommand(m).Run(args[1:]...)
+	case "dump":
+		return newDumpCommand(m).Run(args[1:]...)
+	case "info":
+		return newInfoCommand(m).Run(args[1:]...)
+	case "page":
+		return newPageCommand(m).Run(args[1:]...)
+	case "pages":
+		return newPagesCommand(m).Run(args[1:]...)
+	case "stats":
+		return newStatsCommand(m).Run(args[1:]...)
+	default:
+		return ErrUnknownCommand
+	}
+}
+
+// Usage returns the help message.
+func (m *Main) Usage() string {
+	return strings.TrimLeft(`
+Bolt is a tool for inspecting bolt databases.
+
+Usage:
+
+	bolt command [arguments]
+
+The commands are:
+
+    bench       run synthetic benchmark against bolt
+    check       verifies integrity of bolt database
+    dump        print a hexadecimal dump of one or more pages
+    info        print basic info
+    help        print this screen
+    page        print basic information about one or more pages
+    pages       print list of pages with their types
+    stats       iterate over all pages and generate usage stats
+
+Use "bolt [command] -h" for more information about a command.
+`, "\n")
+}
+
+// CheckCommand represents the "check" command execution.
+type CheckCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newCheckCommand returns a CheckCommand.
+func newCheckCommand(m *Main) *CheckCommand {
+	return &CheckCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *CheckCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Open database.
+	db, err := bolt.Open(path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	defer db.Close()
+
+	// Perform consistency check.
+	return db.View(func(tx *bolt.Tx) error {
+		var count int
+		ch := tx.Check()
+	loop:
+		for {
+			select {
+			case err, ok := <-ch:
+				if !ok {
+					break loop
+				}
+				fmt.Fprintln(cmd.Stdout, err)
+				count++
+			}
+		}
+
+		// Print summary of errors.
+		if count > 0 {
+			fmt.Fprintf(cmd.Stdout, "%d errors found\n", count)
+			return ErrCorrupt
+		}
+
+		// Notify user that database is valid.
+		fmt.Fprintln(cmd.Stdout, "OK")
+		return nil
+	})
+}
+
+// Usage returns the help message.
+func (cmd *CheckCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt check PATH
+
+Check opens a database at PATH and runs an exhaustive check to verify that
+all pages are accessible or are marked as freed. It also verifies that no
+pages are double referenced.
+
+Verification errors will stream out as they are found and the process will
+return after all pages have been checked.
+`, "\n")
+}
+
+// InfoCommand represents the "info" command execution.
+type InfoCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newInfoCommand returns an InfoCommand.
+func newInfoCommand(m *Main) *InfoCommand {
+	return &InfoCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *InfoCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Open the database.
+	db, err := bolt.Open(path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	defer db.Close()
+
+	// Print basic database info.
+	info := db.Info()
+	fmt.Fprintf(cmd.Stdout, "Page Size: %d\n", info.PageSize)
+
+	return nil
+}
+
+// Usage returns the help message.
+func (cmd *InfoCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt info PATH
+
+Info prints basic information about the Bolt database at PATH.
+`, "\n")
+}
+
+// DumpCommand represents the "dump" command execution.
+type DumpCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newDumpCommand returns a DumpCommand.
+func newDumpCommand(m *Main) *DumpCommand {
+	return &DumpCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *DumpCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path and page id.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Read page ids.
+	pageIDs, err := atois(fs.Args()[1:])
+	if err != nil {
+		return err
+	} else if len(pageIDs) == 0 {
+		return ErrPageIDRequired
+	}
+
+	// Open database to retrieve page size.
+	pageSize, err := ReadPageSize(path)
+	if err != nil {
+		return err
+	}
+
+	// Open the database file handle.
+	f, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = f.Close() }()
+
+	// Print each page listed.
+	for i, pageID := range pageIDs {
+		// Print a separator.
+		if i > 0 {
+			fmt.Fprintln(cmd.Stdout, "===============================================\n")
+		}
+
+		// Print page to stdout.
+		if err := cmd.PrintPage(cmd.Stdout, f, pageID, pageSize); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// PrintPage prints a given page as hexadecimal.
+func (cmd *DumpCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID int, pageSize int) error {
+	const bytesPerLineN = 16
+
+	// Read page into buffer.
+	buf := make([]byte, pageSize)
+	addr := pageID * pageSize
+	if n, err := r.ReadAt(buf, int64(addr)); err != nil {
+		return err
+	} else if n != pageSize {
+		return io.ErrUnexpectedEOF
+	}
+
+	// Write out to writer in 16-byte lines.
+	var prev []byte
+	var skipped bool
+	for offset := 0; offset < pageSize; offset += bytesPerLineN {
+		// Retrieve current 16-byte line.
+		line := buf[offset : offset+bytesPerLineN]
+		isLastLine := (offset == (pageSize - bytesPerLineN))
+
+		// If it's the same as the previous line then print a skip.
+		if bytes.Equal(line, prev) && !isLastLine {
+			if !skipped {
+				fmt.Fprintf(w, "%07x *\n", addr+offset)
+				skipped = true
+			}
+		} else {
+			// Print line as hexadecimal in 2-byte groups.
+			fmt.Fprintf(w, "%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr+offset,
+				line[0:2], line[2:4], line[4:6], line[6:8],
+				line[8:10], line[10:12], line[12:14], line[14:16],
+			)
+
+			skipped = false
+		}
+
+		// Save the previous line.
+		prev = line
+	}
+	fmt.Fprint(w, "\n")
+
+	return nil
+}
+
+// Usage returns the help message.
+func (cmd *DumpCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt dump PATH pageid [pageid...]
+
+Dump prints a hexadecimal dump of one or more pages.
+`, "\n")
+}
+
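To make the hex layout produced by PrintPage above concrete, here is a standalone editorial sketch of the same format string applied to one 16-byte line; the bytes and offset are made up.

package main

import "fmt"

func main() {
	// Editorial sketch of the line format used by PrintPage above:
	// a 7-digit hex offset followed by eight 2-byte groups.
	line := []byte("0123456789abcdef") // arbitrary 16-byte line
	addr := 0x2000                     // arbitrary offset within the file
	fmt.Printf("%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr,
		line[0:2], line[2:4], line[4:6], line[6:8],
		line[8:10], line[10:12], line[12:14], line[14:16],
	)
	// Output: 0002000 3031 3233 3435 3637 3839 6162 6364 6566
}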
+// PageCommand represents the "page" command execution.
+type PageCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newPageCommand returns a PageCommand.
+func newPageCommand(m *Main) *PageCommand {
+	return &PageCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *PageCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path and page id.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Read page ids.
+	pageIDs, err := atois(fs.Args()[1:])
+	if err != nil {
+		return err
+	} else if len(pageIDs) == 0 {
+		return ErrPageIDRequired
+	}
+
+	// Open database file handler.
+	f, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = f.Close() }()
+
+	// Print each page listed.
+	for i, pageID := range pageIDs {
+		// Print a separator.
+		if i > 0 {
+			fmt.Fprint(cmd.Stdout, "===============================================\n\n")
+		}
+
+		// Retrieve page info and page size.
+		p, buf, err := ReadPage(path, pageID)
+		if err != nil {
+			return err
+		}
+
+		// Print basic page info.
+		fmt.Fprintf(cmd.Stdout, "Page ID:    %d\n", p.id)
+		fmt.Fprintf(cmd.Stdout, "Page Type:  %s\n", p.Type())
+		fmt.Fprintf(cmd.Stdout, "Total Size: %d bytes\n", len(buf))
+
+		// Print type-specific data.
+		switch p.Type() {
+		case "meta":
+			err = cmd.PrintMeta(cmd.Stdout, buf)
+		case "leaf":
+			err = cmd.PrintLeaf(cmd.Stdout, buf)
+		case "branch":
+			err = cmd.PrintBranch(cmd.Stdout, buf)
+		case "freelist":
+			err = cmd.PrintFreelist(cmd.Stdout, buf)
+		}
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// PrintMeta prints the data from the meta page.
+func (cmd *PageCommand) PrintMeta(w io.Writer, buf []byte) error {
+	m := (*meta)(unsafe.Pointer(&buf[PageHeaderSize]))
+	fmt.Fprintf(w, "Version:    %d\n", m.version)
+	fmt.Fprintf(w, "Page Size:  %d bytes\n", m.pageSize)
+	fmt.Fprintf(w, "Flags:      %08x\n", m.flags)
+	fmt.Fprintf(w, "Root:       <pgid=%d>\n", m.root.root)
+	fmt.Fprintf(w, "Freelist:   <pgid=%d>\n", m.freelist)
+	fmt.Fprintf(w, "HWM:        <pgid=%d>\n", m.pgid)
+	fmt.Fprintf(w, "Txn ID:     %d\n", m.txid)
+	fmt.Fprintf(w, "Checksum:   %016x\n", m.checksum)
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintLeaf prints the data for a leaf page.
+func (cmd *PageCommand) PrintLeaf(w io.Writer, buf []byte) error {
+	p := (*page)(unsafe.Pointer(&buf[0]))
+
+	// Print number of items.
+	fmt.Fprintf(w, "Item Count: %d\n", p.count)
+	fmt.Fprintf(w, "\n")
+
+	// Print each key/value.
+	for i := uint16(0); i < p.count; i++ {
+		e := p.leafPageElement(i)
+
+		// Format key as string.
+		var k string
+		if isPrintable(string(e.key())) {
+			k = fmt.Sprintf("%q", string(e.key()))
+		} else {
+			k = fmt.Sprintf("%x", string(e.key()))
+		}
+
+		// Format value as string.
+		var v string
+		if (e.flags & uint32(bucketLeafFlag)) != 0 {
+			b := (*bucket)(unsafe.Pointer(&e.value()[0]))
+			v = fmt.Sprintf("<pgid=%d,seq=%d>", b.root, b.sequence)
+		} else if isPrintable(string(e.value())) {
+			v = fmt.Sprintf("%q", string(e.value()))
+		} else {
+			v = fmt.Sprintf("%x", string(e.value()))
+		}
+
+		fmt.Fprintf(w, "%s: %s\n", k, v)
+	}
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintBranch prints the data for a branch page.
+func (cmd *PageCommand) PrintBranch(w io.Writer, buf []byte) error {
+	p := (*page)(unsafe.Pointer(&buf[0]))
+
+	// Print number of items.
+	fmt.Fprintf(w, "Item Count: %d\n", p.count)
+	fmt.Fprintf(w, "\n")
+
+	// Print each key/value.
+	for i := uint16(0); i < p.count; i++ {
+		e := p.branchPageElement(i)
+
+		// Format key as string.
+		var k string
+		if isPrintable(string(e.key())) {
+			k = fmt.Sprintf("%q", string(e.key()))
+		} else {
+			k = fmt.Sprintf("%x", string(e.key()))
+		}
+
+		fmt.Fprintf(w, "%s: <pgid=%d>\n", k, e.pgid)
+	}
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintFreelist prints the data for a freelist page.
+func (cmd *PageCommand) PrintFreelist(w io.Writer, buf []byte) error {
+	p := (*page)(unsafe.Pointer(&buf[0]))
+
+	// Print number of items.
+	fmt.Fprintf(w, "Item Count: %d\n", p.count)
+	fmt.Fprintf(w, "\n")
+
+	// Print each page in the freelist.
+	ids := (*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr))
+	for i := uint16(0); i < p.count; i++ {
+		fmt.Fprintf(w, "%d\n", ids[i])
+	}
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintPage prints a given page as hexadecimal.
+func (cmd *PageCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID int, pageSize int) error {
+	const bytesPerLineN = 16
+
+	// Read page into buffer.
+	buf := make([]byte, pageSize)
+	addr := pageID * pageSize
+	if n, err := r.ReadAt(buf, int64(addr)); err != nil {
+		return err
+	} else if n != pageSize {
+		return io.ErrUnexpectedEOF
+	}
+
+	// Write out to writer in 16-byte lines.
+	var prev []byte
+	var skipped bool
+	for offset := 0; offset < pageSize; offset += bytesPerLineN {
+		// Retrieve current 16-byte line.
+		line := buf[offset : offset+bytesPerLineN]
+		isLastLine := (offset == (pageSize - bytesPerLineN))
+
+		// If it's the same as the previous line then print a skip.
+		if bytes.Equal(line, prev) && !isLastLine {
+			if !skipped {
+				fmt.Fprintf(w, "%07x *\n", addr+offset)
+				skipped = true
+			}
+		} else {
+			// Print line as hexadecimal in 2-byte groups.
+			fmt.Fprintf(w, "%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr+offset,
+				line[0:2], line[2:4], line[4:6], line[6:8],
+				line[8:10], line[10:12], line[12:14], line[14:16],
+			)
+
+			skipped = false
+		}
+
+		// Save the previous line.
+		prev = line
+	}
+	fmt.Fprint(w, "\n")
+
+	return nil
+}
+
+// Usage returns the help message.
+func (cmd *PageCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt page PATH pageid [pageid...]
+
+Page prints one or more pages in human readable format.
+`, "\n")
+}
+
+// PagesCommand represents the "pages" command execution.
+type PagesCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newPagesCommand returns a PagesCommand.
+func newPagesCommand(m *Main) *PagesCommand {
+	return &PagesCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *PagesCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Open database.
+	db, err := bolt.Open(path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = db.Close() }()
+
+	// Write header.
+	fmt.Fprintln(cmd.Stdout, "ID       TYPE       ITEMS  OVRFLW")
+	fmt.Fprintln(cmd.Stdout, "======== ========== ====== ======")
+
+	return db.Update(func(tx *bolt.Tx) error {
+		var id int
+		for {
+			p, err := tx.Page(id)
+			if err != nil {
+				return &PageError{ID: id, Err: err}
+			} else if p == nil {
+				break
+			}
+
+			// Only display count and overflow if this is a non-free page.
+			var count, overflow string
+			if p.Type != "free" {
+				count = strconv.Itoa(p.Count)
+				if p.OverflowCount > 0 {
+					overflow = strconv.Itoa(p.OverflowCount)
+				}
+			}
+
+			// Print table row.
+			fmt.Fprintf(cmd.Stdout, "%-8d %-10s %-6s %-6s\n", p.ID, p.Type, count, overflow)
+
+			// Move to the next non-overflow page.
+			id += 1
+			if p.Type != "free" {
+				id += p.OverflowCount
+			}
+		}
+		return nil
+	})
+}
+
+// Usage returns the help message.
+func (cmd *PagesCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt pages PATH
+
+Pages prints a table of pages with their type (meta, leaf, branch, freelist).
+Leaf and branch pages will show a key count in the "items" column while the
+freelist will show the number of free pages in the "items" column.
+
+The "overflow" column shows the number of blocks that the page spills over
+into. Normally there is no overflow but large keys and values can cause
+a single page to take up multiple blocks.
+`, "\n")
+}
+
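The "overflow" column described above also determines how the listing advances through page ids: a non-free page with N overflow blocks occupies N+1 blocks, so the next distinct page starts after them. A worked editorial sketch with made-up numbers:

package main

import "fmt"

func main() {
	// Editorial sketch of the id-advance rule in PagesCommand.Run above.
	id, overflowCount := 3, 2 // hypothetical leaf page at id 3 spilling into 2 extra blocks
	next := id + 1 + overflowCount
	fmt.Println(next) // 6 -- the next distinct page id
}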
+// StatsCommand represents the "stats" command execution.
+type StatsCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newStatsCommand returns a StatsCommand.
+func newStatsCommand(m *Main) *StatsCommand {
+	return &StatsCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *StatsCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path.
+	path, prefix := fs.Arg(0), fs.Arg(1)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Open database.
+	db, err := bolt.Open(path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	defer db.Close()
+
+	return db.View(func(tx *bolt.Tx) error {
+		var s bolt.BucketStats
+		var count int
+		if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error {
+			if bytes.HasPrefix(name, []byte(prefix)) {
+				s.Add(b.Stats())
+				count += 1
+			}
+			return nil
+		}); err != nil {
+			return err
+		}
+
+		fmt.Fprintf(cmd.Stdout, "Aggregate statistics for %d buckets\n\n", count)
+
+		fmt.Fprintln(cmd.Stdout, "Page count statistics")
+		fmt.Fprintf(cmd.Stdout, "\tNumber of logical branch pages: %d\n", s.BranchPageN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of physical branch overflow pages: %d\n", s.BranchOverflowN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of logical leaf pages: %d\n", s.LeafPageN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of physical leaf overflow pages: %d\n", s.LeafOverflowN)
+
+		fmt.Fprintln(cmd.Stdout, "Tree statistics")
+		fmt.Fprintf(cmd.Stdout, "\tNumber of keys/value pairs: %d\n", s.KeyN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of levels in B+tree: %d\n", s.Depth)
+
+		fmt.Fprintln(cmd.Stdout, "Page size utilization")
+		fmt.Fprintf(cmd.Stdout, "\tBytes allocated for physical branch pages: %d\n", s.BranchAlloc)
+		var percentage int
+		if s.BranchAlloc != 0 {
+			percentage = int(float32(s.BranchInuse) * 100.0 / float32(s.BranchAlloc))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tBytes actually used for branch data: %d (%d%%)\n", s.BranchInuse, percentage)
+		fmt.Fprintf(cmd.Stdout, "\tBytes allocated for physical leaf pages: %d\n", s.LeafAlloc)
+		percentage = 0
+		if s.LeafAlloc != 0 {
+			percentage = int(float32(s.LeafInuse) * 100.0 / float32(s.LeafAlloc))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tBytes actually used for leaf data: %d (%d%%)\n", s.LeafInuse, percentage)
+
+		fmt.Fprintln(cmd.Stdout, "Bucket statistics")
+		fmt.Fprintf(cmd.Stdout, "\tTotal number of buckets: %d\n", s.BucketN)
+		percentage = int(float32(s.InlineBucketN) * 100.0 / float32(s.BucketN))
+		fmt.Fprintf(cmd.Stdout, "\tTotal number on inlined buckets: %d (%d%%)\n", s.InlineBucketN, percentage)
+		percentage = 0
+		if s.LeafInuse != 0 {
+			percentage = int(float32(s.InlineBucketInuse) * 100.0 / float32(s.LeafInuse))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tBytes used for inlined buckets: %d (%d%%)\n", s.InlineBucketInuse, percentage)
+
+		return nil
+	})
+}
+
+// Usage returns the help message.
+func (cmd *StatsCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt stats PATH [PREFIX]
+
+Stats aggregates statistics for the buckets in the database at PATH. If a
+PREFIX is given, only buckets whose names start with PREFIX are included;
+otherwise every bucket is counted.
+
+The report covers:
+
+    page counts
+        Logical branch and leaf pages plus their physical overflow pages.
+
+    tree statistics
+        The number of key/value pairs and the depth of the B+tree.
+
+    page size utilization
+        Bytes allocated for branch and leaf pages versus bytes actually
+        used, reported with a percentage.
+
+    bucket statistics
+        The total bucket count, the number of inlined buckets, and the
+        bytes used by inlined buckets.
+`, "\n")
+}
+
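The utilization figures reported above are plain integer ratios of in-use bytes to allocated bytes. Using the leaf numbers that appear in the test later in this change (1996 bytes used in one 4096-byte page), a standalone editorial sketch reproduces the 48% line:

package main

import "fmt"

func main() {
	// Editorial sketch of the percentage math in StatsCommand.Run above.
	leafInuse, leafAlloc := 1996, 4096
	var percentage int
	if leafAlloc != 0 {
		percentage = int(float32(leafInuse) * 100.0 / float32(leafAlloc))
	}
	fmt.Printf("Bytes actually used for leaf data: %d (%d%%)\n", leafInuse, percentage)
	// Output: Bytes actually used for leaf data: 1996 (48%)
}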
+var benchBucketName = []byte("bench")
+
+// BenchCommand represents the "bench" command execution.
+type BenchCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newBenchCommand returns a BenchCommand using the given Main's streams.
+func newBenchCommand(m *Main) *BenchCommand {
+	return &BenchCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the "bench" command.
+func (cmd *BenchCommand) Run(args ...string) error {
+	// Parse CLI arguments.
+	options, err := cmd.ParseFlags(args)
+	if err != nil {
+		return err
+	}
+
+	// Remove path if "-work" is not set. Otherwise keep path.
+	if options.Work {
+		fmt.Fprintf(cmd.Stdout, "work: %s\n", options.Path)
+	} else {
+		defer os.Remove(options.Path)
+	}
+
+	// Create database.
+	db, err := bolt.Open(options.Path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	db.NoSync = options.NoSync
+	defer db.Close()
+
+	// Write to the database.
+	var results BenchResults
+	if err := cmd.runWrites(db, options, &results); err != nil {
+		return fmt.Errorf("bench: write: %s", err)
+	}
+
+	// Read from the database.
+	if err := cmd.runReads(db, options, &results); err != nil {
+		return fmt.Errorf("bench: read: %s", err)
+	}
+
+	// Print results.
+	fmt.Fprintf(cmd.Stderr, "# Write\t%v\t(%v/op)\t(%v op/sec)\n", results.WriteDuration, results.WriteOpDuration(), results.WriteOpsPerSecond())
+	fmt.Fprintf(cmd.Stderr, "# Read\t%v\t(%v/op)\t(%v op/sec)\n", results.ReadDuration, results.ReadOpDuration(), results.ReadOpsPerSecond())
+	fmt.Fprintln(cmd.Stderr, "")
+	return nil
+}
+
+// ParseFlags parses the command line flags.
+func (cmd *BenchCommand) ParseFlags(args []string) (*BenchOptions, error) {
+	var options BenchOptions
+
+	// Parse flagset.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	fs.StringVar(&options.ProfileMode, "profile-mode", "rw", "")
+	fs.StringVar(&options.WriteMode, "write-mode", "seq", "")
+	fs.StringVar(&options.ReadMode, "read-mode", "seq", "")
+	fs.IntVar(&options.Iterations, "count", 1000, "")
+	fs.IntVar(&options.BatchSize, "batch-size", 0, "")
+	fs.IntVar(&options.KeySize, "key-size", 8, "")
+	fs.IntVar(&options.ValueSize, "value-size", 32, "")
+	fs.StringVar(&options.CPUProfile, "cpuprofile", "", "")
+	fs.StringVar(&options.MemProfile, "memprofile", "", "")
+	fs.StringVar(&options.BlockProfile, "blockprofile", "", "")
+	fs.Float64Var(&options.FillPercent, "fill-percent", bolt.DefaultFillPercent, "")
+	fs.BoolVar(&options.NoSync, "no-sync", false, "")
+	fs.BoolVar(&options.Work, "work", false, "")
+	fs.StringVar(&options.Path, "path", "", "")
+	fs.SetOutput(cmd.Stderr)
+	if err := fs.Parse(args); err != nil {
+		return nil, err
+	}
+
+	// Default the batch size to the iteration count if not set.
+	// Require that the iteration count be evenly divisible by the batch size.
+	if options.BatchSize == 0 {
+		options.BatchSize = options.Iterations
+	} else if options.Iterations%options.BatchSize != 0 {
+		return nil, ErrNonDivisibleBatchSize
+	}
+
+	// Generate temp path if one is not passed in.
+	if options.Path == "" {
+		f, err := ioutil.TempFile("", "bolt-bench-")
+		if err != nil {
+			return nil, fmt.Errorf("temp file: %s", err)
+		}
+		f.Close()
+		os.Remove(f.Name())
+		options.Path = f.Name()
+	}
+
+	return &options, nil
+}
+
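One rule in ParseFlags above is worth spelling out: -count must be a multiple of -batch-size, since writes are committed one batch per transaction. A small standalone editorial sketch of the check with made-up values:

package main

import "fmt"

func main() {
	// Editorial sketch of the divisibility rule enforced by ParseFlags above.
	iterations := 1000
	for _, batchSize := range []int{250, 300} {
		fmt.Println(batchSize, iterations%batchSize == 0) // 250 true (4 batches), 300 false (rejected)
	}
}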
+// runWrites writes to the database.
+func (cmd *BenchCommand) runWrites(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	// Start profiling for writes.
+	if options.ProfileMode == "rw" || options.ProfileMode == "w" {
+		cmd.startProfiling(options)
+	}
+
+	t := time.Now()
+
+	var err error
+	switch options.WriteMode {
+	case "seq":
+		err = cmd.runWritesSequential(db, options, results)
+	case "rnd":
+		err = cmd.runWritesRandom(db, options, results)
+	case "seq-nest":
+		err = cmd.runWritesSequentialNested(db, options, results)
+	case "rnd-nest":
+		err = cmd.runWritesRandomNested(db, options, results)
+	default:
+		return fmt.Errorf("invalid write mode: %s", options.WriteMode)
+	}
+
+	// Save time to write.
+	results.WriteDuration = time.Since(t)
+
+	// Stop profiling for writes only.
+	if options.ProfileMode == "w" {
+		cmd.stopProfiling()
+	}
+
+	return err
+}
+
+func (cmd *BenchCommand) runWritesSequential(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	var i = uint32(0)
+	return cmd.runWritesWithSource(db, options, results, func() uint32 { i++; return i })
+}
+
+func (cmd *BenchCommand) runWritesRandom(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	return cmd.runWritesWithSource(db, options, results, func() uint32 { return r.Uint32() })
+}
+
+func (cmd *BenchCommand) runWritesSequentialNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	var i = uint32(0)
+	return cmd.runWritesNestedWithSource(db, options, results, func() uint32 { i++; return i })
+}
+
+func (cmd *BenchCommand) runWritesRandomNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	return cmd.runWritesNestedWithSource(db, options, results, func() uint32 { return r.Uint32() })
+}
+
+func (cmd *BenchCommand) runWritesWithSource(db *bolt.DB, options *BenchOptions, results *BenchResults, keySource func() uint32) error {
+	results.WriteOps = options.Iterations
+
+	for i := 0; i < options.Iterations; i += options.BatchSize {
+		if err := db.Update(func(tx *bolt.Tx) error {
+			b, _ := tx.CreateBucketIfNotExists(benchBucketName)
+			b.FillPercent = options.FillPercent
+
+			for j := 0; j < options.BatchSize; j++ {
+				key := make([]byte, options.KeySize)
+				value := make([]byte, options.ValueSize)
+
+				// Write key as uint32.
+				binary.BigEndian.PutUint32(key, keySource())
+
+				// Insert key/value.
+				if err := b.Put(key, value); err != nil {
+					return err
+				}
+			}
+
+			return nil
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (cmd *BenchCommand) runWritesNestedWithSource(db *bolt.DB, options *BenchOptions, results *BenchResults, keySource func() uint32) error {
+	results.WriteOps = options.Iterations
+
+	for i := 0; i < options.Iterations; i += options.BatchSize {
+		if err := db.Update(func(tx *bolt.Tx) error {
+			top, err := tx.CreateBucketIfNotExists(benchBucketName)
+			if err != nil {
+				return err
+			}
+			top.FillPercent = options.FillPercent
+
+			// Create bucket key.
+			name := make([]byte, options.KeySize)
+			binary.BigEndian.PutUint32(name, keySource())
+
+			// Create bucket.
+			b, err := top.CreateBucketIfNotExists(name)
+			if err != nil {
+				return err
+			}
+			b.FillPercent = options.FillPercent
+
+			for j := 0; j < options.BatchSize; j++ {
+				var key = make([]byte, options.KeySize)
+				var value = make([]byte, options.ValueSize)
+
+				// Generate key as uint32.
+				binary.BigEndian.PutUint32(key, keySource())
+
+				// Insert value into subbucket.
+				if err := b.Put(key, value); err != nil {
+					return err
+				}
+			}
+
+			return nil
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// runReads reads from the database.
+func (cmd *BenchCommand) runReads(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	// Start profiling for reads.
+	if options.ProfileMode == "r" {
+		cmd.startProfiling(options)
+	}
+
+	t := time.Now()
+
+	var err error
+	switch options.ReadMode {
+	case "seq":
+		switch options.WriteMode {
+		case "seq-nest", "rnd-nest":
+			err = cmd.runReadsSequentialNested(db, options, results)
+		default:
+			err = cmd.runReadsSequential(db, options, results)
+		}
+	default:
+		return fmt.Errorf("invalid read mode: %s", options.ReadMode)
+	}
+
+	// Save read time.
+	results.ReadDuration = time.Since(t)
+
+	// Stop profiling for reads.
+	if options.ProfileMode == "rw" || options.ProfileMode == "r" {
+		cmd.stopProfiling()
+	}
+
+	return err
+}
+
+func (cmd *BenchCommand) runReadsSequential(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	return db.View(func(tx *bolt.Tx) error {
+		t := time.Now()
+
+		for {
+			var count int
+
+			c := tx.Bucket(benchBucketName).Cursor()
+			for k, v := c.First(); k != nil; k, v = c.Next() {
+				if v == nil {
+					return errors.New("invalid value")
+				}
+				count++
+			}
+
+			if options.WriteMode == "seq" && count != options.Iterations {
+				return fmt.Errorf("read seq: iter mismatch: expected %d, got %d", options.Iterations, count)
+			}
+
+			results.ReadOps += count
+
+			// Make sure we do this for at least a second.
+			if time.Since(t) >= time.Second {
+				break
+			}
+		}
+
+		return nil
+	})
+}
+
+func (cmd *BenchCommand) runReadsSequentialNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	return db.View(func(tx *bolt.Tx) error {
+		t := time.Now()
+
+		for {
+			var count int
+			var top = tx.Bucket(benchBucketName)
+			if err := top.ForEach(func(name, _ []byte) error {
+				c := top.Bucket(name).Cursor()
+				for k, v := c.First(); k != nil; k, v = c.Next() {
+					if v == nil {
+						return ErrInvalidValue
+					}
+					count++
+				}
+				return nil
+			}); err != nil {
+				return err
+			}
+
+			if options.WriteMode == "seq-nest" && count != options.Iterations {
+				return fmt.Errorf("read seq-nest: iter mismatch: expected %d, got %d", options.Iterations, count)
+			}
+
+			results.ReadOps += count
+
+			// Make sure we do this for at least a second.
+			if time.Since(t) >= time.Second {
+				break
+			}
+		}
+
+		return nil
+	})
+}
+
+// File handles for the various profiles.
+var cpuprofile, memprofile, blockprofile *os.File
+
+// Starts all profiles set on the options.
+func (cmd *BenchCommand) startProfiling(options *BenchOptions) {
+	var err error
+
+	// Start CPU profiling.
+	if options.CPUProfile != "" {
+		cpuprofile, err = os.Create(options.CPUProfile)
+		if err != nil {
+			fmt.Fprintf(cmd.Stderr, "bench: could not create cpu profile %q: %v\n", options.CPUProfile, err)
+			os.Exit(1)
+		}
+		pprof.StartCPUProfile(cpuprofile)
+	}
+
+	// Start memory profiling.
+	if options.MemProfile != "" {
+		memprofile, err = os.Create(options.MemProfile)
+		if err != nil {
+			fmt.Fprintf(cmd.Stderr, "bench: could not create memory profile %q: %v\n", options.MemProfile, err)
+			os.Exit(1)
+		}
+		runtime.MemProfileRate = 4096
+	}
+
+	// Start block profiling.
+	if options.BlockProfile != "" {
+		blockprofile, err = os.Create(options.BlockProfile)
+		if err != nil {
+			fmt.Fprintf(cmd.Stderr, "bench: could not create block profile %q: %v\n", options.BlockProfile, err)
+			os.Exit(1)
+		}
+		runtime.SetBlockProfileRate(1)
+	}
+}
+
+// Stops all profiles.
+func (cmd *BenchCommand) stopProfiling() {
+	if cpuprofile != nil {
+		pprof.StopCPUProfile()
+		cpuprofile.Close()
+		cpuprofile = nil
+	}
+
+	if memprofile != nil {
+		pprof.Lookup("heap").WriteTo(memprofile, 0)
+		memprofile.Close()
+		memprofile = nil
+	}
+
+	if blockprofile != nil {
+		pprof.Lookup("block").WriteTo(blockprofile, 0)
+		blockprofile.Close()
+		blockprofile = nil
+		runtime.SetBlockProfileRate(0)
+	}
+}
+
+// BenchOptions represents the set of options that can be passed to "bolt bench".
+type BenchOptions struct {
+	ProfileMode   string
+	WriteMode     string
+	ReadMode      string
+	Iterations    int
+	BatchSize     int
+	KeySize       int
+	ValueSize     int
+	CPUProfile    string
+	MemProfile    string
+	BlockProfile  string
+	StatsInterval time.Duration
+	FillPercent   float64
+	NoSync        bool
+	Work          bool
+	Path          string
+}
+
+// BenchResults represents the performance results of the benchmark.
+type BenchResults struct {
+	WriteOps      int
+	WriteDuration time.Duration
+	ReadOps       int
+	ReadDuration  time.Duration
+}
+
+// WriteOpDuration returns the duration for a single write operation.
+func (r *BenchResults) WriteOpDuration() time.Duration {
+	if r.WriteOps == 0 {
+		return 0
+	}
+	return r.WriteDuration / time.Duration(r.WriteOps)
+}
+
+// WriteOpsPerSecond returns the average number of write operations that can be performed per second.
+func (r *BenchResults) WriteOpsPerSecond() int {
+	var op = r.WriteOpDuration()
+	if op == 0 {
+		return 0
+	}
+	return int(time.Second) / int(op)
+}
+
+// ReadOpDuration returns the duration for a single read operation.
+func (r *BenchResults) ReadOpDuration() time.Duration {
+	if r.ReadOps == 0 {
+		return 0
+	}
+	return r.ReadDuration / time.Duration(r.ReadOps)
+}
+
+// ReadOpsPerSecond returns the average number of read operations that can be performed per second.
+func (r *BenchResults) ReadOpsPerSecond() int {
+	var op = r.ReadOpDuration()
+	if op == 0 {
+		return 0
+	}
+	return int(time.Second) / int(op)
+}
+
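As a worked example of the rate math above, with made-up figures: 1,000 writes completing in 2 seconds gives 2ms per operation and 500 operations per second.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Editorial sketch of the BenchResults arithmetic above.
	writeOps := 1000
	writeDuration := 2 * time.Second
	perOp := writeDuration / time.Duration(writeOps)
	fmt.Println(perOp)                         // 2ms
	fmt.Println(int(time.Second) / int(perOp)) // 500 ops/sec
}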
+type PageError struct {
+	ID  int
+	Err error
+}
+
+func (e *PageError) Error() string {
+	return fmt.Sprintf("page error: id=%d, err=%s", e.ID, e.Err)
+}
+
+// isPrintable returns true if the string is valid UTF-8 and contains only printable runes.
+func isPrintable(s string) bool {
+	if !utf8.ValidString(s) {
+		return false
+	}
+	for _, ch := range s {
+		if !unicode.IsPrint(ch) {
+			return false
+		}
+	}
+	return true
+}
+
+// ReadPage reads page info & full page data from a path.
+// This is not transactionally safe.
+func ReadPage(path string, pageID int) (*page, []byte, error) {
+	// Find page size.
+	pageSize, err := ReadPageSize(path)
+	if err != nil {
+		return nil, nil, fmt.Errorf("read page size: %s", err)
+	}
+
+	// Open database file.
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer f.Close()
+
+	// Read one block into buffer.
+	buf := make([]byte, pageSize)
+	if n, err := f.ReadAt(buf, int64(pageID*pageSize)); err != nil {
+		return nil, nil, err
+	} else if n != len(buf) {
+		return nil, nil, io.ErrUnexpectedEOF
+	}
+
+	// Determine total number of blocks.
+	p := (*page)(unsafe.Pointer(&buf[0]))
+	overflowN := p.overflow
+
+	// Re-read entire page (with overflow) into buffer.
+	buf = make([]byte, (int(overflowN)+1)*pageSize)
+	if n, err := f.ReadAt(buf, int64(pageID*pageSize)); err != nil {
+		return nil, nil, err
+	} else if n != len(buf) {
+		return nil, nil, io.ErrUnexpectedEOF
+	}
+	p = (*page)(unsafe.Pointer(&buf[0]))
+
+	return p, buf, nil
+}
+
+// ReadPageSize reads the page size from the database file at path.
+// This is not transactionally safe.
+func ReadPageSize(path string) (int, error) {
+	// Open database file.
+	f, err := os.Open(path)
+	if err != nil {
+		return 0, err
+	}
+	defer f.Close()
+
+	// Read 4KB chunk.
+	buf := make([]byte, 4096)
+	if _, err := io.ReadFull(f, buf); err != nil {
+		return 0, err
+	}
+
+	// Read page size from metadata.
+	m := (*meta)(unsafe.Pointer(&buf[PageHeaderSize]))
+	return int(m.pageSize), nil
+}
+
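A hedged sketch of how the two helpers above compose; the function below is editorial, assumes it sits in the same package as ReadPageSize and ReadPage, and uses a hypothetical path and page id.

// dumpPageInfo is an editorial sketch, not part of this change. It only
// needs "fmt" imported plus the helpers defined above.
func dumpPageInfo(path string, pageID int) error {
	pageSize, err := ReadPageSize(path)
	if err != nil {
		return err
	}
	fmt.Printf("page size: %d\n", pageSize)

	p, buf, err := ReadPage(path, pageID)
	if err != nil {
		return err
	}
	fmt.Printf("page %d is a %s page spanning %d bytes (%d block(s))\n",
		pageID, p.Type(), len(buf), len(buf)/pageSize)
	return nil
}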
+// atois parses a slice of strings into integers.
+func atois(strs []string) ([]int, error) {
+	var a []int
+	for _, str := range strs {
+		i, err := strconv.Atoi(str)
+		if err != nil {
+			return nil, err
+		}
+		a = append(a, i)
+	}
+	return a, nil
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+const maxAllocSize = 0xFFFFFFF
+
+// DO NOT EDIT. Copied from the "bolt" package.
+const (
+	branchPageFlag   = 0x01
+	leafPageFlag     = 0x02
+	metaPageFlag     = 0x04
+	freelistPageFlag = 0x10
+)
+
+// DO NOT EDIT. Copied from the "bolt" package.
+const bucketLeafFlag = 0x01
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type pgid uint64
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type txid uint64
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type meta struct {
+	magic    uint32
+	version  uint32
+	pageSize uint32
+	flags    uint32
+	root     bucket
+	freelist pgid
+	pgid     pgid
+	txid     txid
+	checksum uint64
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type bucket struct {
+	root     pgid
+	sequence uint64
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type page struct {
+	id       pgid
+	flags    uint16
+	count    uint16
+	overflow uint32
+	ptr      uintptr
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+func (p *page) Type() string {
+	if (p.flags & branchPageFlag) != 0 {
+		return "branch"
+	} else if (p.flags & leafPageFlag) != 0 {
+		return "leaf"
+	} else if (p.flags & metaPageFlag) != 0 {
+		return "meta"
+	} else if (p.flags & freelistPageFlag) != 0 {
+		return "freelist"
+	}
+	return fmt.Sprintf("unknown<%02x>", p.flags)
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+func (p *page) leafPageElement(index uint16) *leafPageElement {
+	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
+	return n
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+func (p *page) branchPageElement(index uint16) *branchPageElement {
+	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type branchPageElement struct {
+	pos   uint32
+	ksize uint32
+	pgid  pgid
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+func (n *branchPageElement) key() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos : n.pos+n.ksize]
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+type leafPageElement struct {
+	flags uint32
+	pos   uint32
+	ksize uint32
+	vsize uint32
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+func (n *leafPageElement) key() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos : n.pos+n.ksize]
+}
+
+// DO NOT EDIT. Copied from the "bolt" package.
+func (n *leafPageElement) value() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
+}

+ 145 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main_test.go

@@ -0,0 +1,145 @@
+package main_test
+
+import (
+	"bytes"
+	"io/ioutil"
+	"os"
+	"strconv"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt"
+)
+
+// Ensure the "info" command can print information about a database.
+func TestInfoCommand_Run(t *testing.T) {
+	db := MustOpen(0666, nil)
+	db.DB.Close()
+	defer db.Close()
+
+	// Run the info command.
+	m := NewMain()
+	if err := m.Run("info", db.Path); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure the "stats" command can execute correctly.
+func TestStatsCommand_Run(t *testing.T) {
+	// Skip if the OS page size is not 4KB; the expected output below assumes 4096-byte pages.
+	if os.Getpagesize() != 4096 {
+		t.Skip("system does not use 4KB page size")
+	}
+
+	db := MustOpen(0666, nil)
+	defer db.Close()
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		// Create "foo" bucket.
+		b, err := tx.CreateBucket([]byte("foo"))
+		if err != nil {
+			return err
+		}
+		for i := 0; i < 10; i++ {
+			if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil {
+				return err
+			}
+		}
+
+		// Create "bar" bucket.
+		b, err = tx.CreateBucket([]byte("bar"))
+		if err != nil {
+			return err
+		}
+		for i := 0; i < 100; i++ {
+			if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil {
+				return err
+			}
+		}
+
+		// Create "baz" bucket.
+		b, err = tx.CreateBucket([]byte("baz"))
+		if err != nil {
+			return err
+		}
+		if err := b.Put([]byte("key"), []byte("value")); err != nil {
+			return err
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+	db.DB.Close()
+
+	// Generate expected result.
+	exp := "Aggregate statistics for 3 buckets\n\n" +
+		"Page count statistics\n" +
+		"\tNumber of logical branch pages: 0\n" +
+		"\tNumber of physical branch overflow pages: 0\n" +
+		"\tNumber of logical leaf pages: 1\n" +
+		"\tNumber of physical leaf overflow pages: 0\n" +
+		"Tree statistics\n" +
+		"\tNumber of keys/value pairs: 111\n" +
+		"\tNumber of levels in B+tree: 1\n" +
+		"Page size utilization\n" +
+		"\tBytes allocated for physical branch pages: 0\n" +
+		"\tBytes actually used for branch data: 0 (0%)\n" +
+		"\tBytes allocated for physical leaf pages: 4096\n" +
+		"\tBytes actually used for leaf data: 1996 (48%)\n" +
+		"Bucket statistics\n" +
+		"\tTotal number of buckets: 3\n" +
+		"\tTotal number on inlined buckets: 2 (66%)\n" +
+		"\tBytes used for inlined buckets: 236 (11%)\n"
+
+	// Run the command.
+	m := NewMain()
+	if err := m.Run("stats", db.Path); err != nil {
+		t.Fatal(err)
+	} else if m.Stdout.String() != exp {
+		t.Fatalf("unexpected stdout:\n\n%s", m.Stdout.String())
+	}
+}
+
+// Main represents a test wrapper for main.Main that records output.
+type Main struct {
+	*main.Main
+	Stdin  bytes.Buffer
+	Stdout bytes.Buffer
+	Stderr bytes.Buffer
+}
+
+// NewMain returns a new instance of Main.
+func NewMain() *Main {
+	m := &Main{Main: main.NewMain()}
+	m.Main.Stdin = &m.Stdin
+	m.Main.Stdout = &m.Stdout
+	m.Main.Stderr = &m.Stderr
+	return m
+}
+
+// MustOpen creates a Bolt database in a temporary location.
+func MustOpen(mode os.FileMode, options *bolt.Options) *DB {
+	// Create temporary path.
+	f, _ := ioutil.TempFile("", "bolt-")
+	f.Close()
+	os.Remove(f.Name())
+
+	db, err := bolt.Open(f.Name(), mode, options)
+	if err != nil {
+		panic(err.Error())
+	}
+	return &DB{DB: db, Path: f.Name()}
+}
+
+// DB is a test wrapper for bolt.DB.
+type DB struct {
+	*bolt.DB
+	Path string
+}
+
+// Close closes and removes the database.
+func (db *DB) Close() error {
+	defer os.Remove(db.Path)
+	return db.DB.Close()
+}

BIN
Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/p.out


+ 384 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/cursor.go

@@ -0,0 +1,384 @@
+package bolt
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+)
+
+// Cursor represents an iterator that can traverse over all key/value pairs in a bucket in sorted order.
+// Cursors see nested buckets with value == nil.
+// Cursors can be obtained from a transaction and are valid as long as the transaction is open.
+//
+// Keys and values returned from the cursor are only valid for the life of the transaction.
+//
+// Changing data while traversing with a cursor may cause it to be invalidated
+// and return unexpected keys and/or values. You must reposition your cursor
+// after mutating data.
+type Cursor struct {
+	bucket *Bucket
+	stack  []elemRef
+}
+
+// Bucket returns the bucket that this cursor was created from.
+func (c *Cursor) Bucket() *Bucket {
+	return c.bucket
+}
+
+// First moves the cursor to the first item in the bucket and returns its key and value.
+// If the bucket is empty then a nil key and value are returned.
+// The returned key and value are only valid for the life of the transaction.
+func (c *Cursor) First() (key []byte, value []byte) {
+	_assert(c.bucket.tx.db != nil, "tx closed")
+	c.stack = c.stack[:0]
+	p, n := c.bucket.pageNode(c.bucket.root)
+	c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
+	c.first()
+	k, v, flags := c.keyValue()
+	if (flags & uint32(bucketLeafFlag)) != 0 {
+		return k, nil
+	}
+	return k, v
+}
+
+// Last moves the cursor to the last item in the bucket and returns its key and value.
+// If the bucket is empty then a nil key and value are returned.
+// The returned key and value are only valid for the life of the transaction.
+func (c *Cursor) Last() (key []byte, value []byte) {
+	_assert(c.bucket.tx.db != nil, "tx closed")
+	c.stack = c.stack[:0]
+	p, n := c.bucket.pageNode(c.bucket.root)
+	ref := elemRef{page: p, node: n}
+	ref.index = ref.count() - 1
+	c.stack = append(c.stack, ref)
+	c.last()
+	k, v, flags := c.keyValue()
+	if (flags & uint32(bucketLeafFlag)) != 0 {
+		return k, nil
+	}
+	return k, v
+}
+
+// Next moves the cursor to the next item in the bucket and returns its key and value.
+// If the cursor is at the end of the bucket then a nil key and value are returned.
+// The returned key and value are only valid for the life of the transaction.
+func (c *Cursor) Next() (key []byte, value []byte) {
+	_assert(c.bucket.tx.db != nil, "tx closed")
+	k, v, flags := c.next()
+	if (flags & uint32(bucketLeafFlag)) != 0 {
+		return k, nil
+	}
+	return k, v
+}
+
+// Prev moves the cursor to the previous item in the bucket and returns its key and value.
+// If the cursor is at the beginning of the bucket then a nil key and value are returned.
+// The returned key and value are only valid for the life of the transaction.
+func (c *Cursor) Prev() (key []byte, value []byte) {
+	_assert(c.bucket.tx.db != nil, "tx closed")
+
+	// Attempt to move back one element until we're successful.
+	// Move up the stack as we hit the beginning of each page in our stack.
+	for i := len(c.stack) - 1; i >= 0; i-- {
+		elem := &c.stack[i]
+		if elem.index > 0 {
+			elem.index--
+			break
+		}
+		c.stack = c.stack[:i]
+	}
+
+	// If we've hit the end then return nil.
+	if len(c.stack) == 0 {
+		return nil, nil
+	}
+
+	// Move down the stack to find the last element of the last leaf under this branch.
+	c.last()
+	k, v, flags := c.keyValue()
+	if (flags & uint32(bucketLeafFlag)) != 0 {
+		return k, nil
+	}
+	return k, v
+}
+
+// Seek moves the cursor to a given key and returns it.
+// If the key does not exist then the next key is used. If no keys
+// follow, a nil key is returned.
+// The returned key and value are only valid for the life of the transaction.
+func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
+	k, v, flags := c.seek(seek)
+
+	// If we ended up after the last element of a page then move to the next one.
+	if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() {
+		k, v, flags = c.next()
+	}
+
+	if k == nil {
+		return nil, nil
+	} else if (flags & uint32(bucketLeafFlag)) != 0 {
+		return k, nil
+	}
+	return k, v
+}
+
+// Delete removes the current key/value under the cursor from the bucket.
+// Delete fails if current key/value is a bucket or if the transaction is not writable.
+func (c *Cursor) Delete() error {
+	if c.bucket.tx.db == nil {
+		return ErrTxClosed
+	} else if !c.bucket.Writable() {
+		return ErrTxNotWritable
+	}
+
+	key, _, flags := c.keyValue()
+	// Return an error if current value is a bucket.
+	if (flags & bucketLeafFlag) != 0 {
+		return ErrIncompatibleValue
+	}
+	c.node().del(key)
+
+	return nil
+}
+
+// seek moves the cursor to a given key and returns it.
+// If the key does not exist then the next key is used.
+func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
+	_assert(c.bucket.tx.db != nil, "tx closed")
+
+	// Start from root page/node and traverse to correct page.
+	c.stack = c.stack[:0]
+	c.search(seek, c.bucket.root)
+	ref := &c.stack[len(c.stack)-1]
+
+	// If the cursor is pointing to the end of page/node then return nil.
+	if ref.index >= ref.count() {
+		return nil, nil, 0
+	}
+
+	// If this is a bucket then return a nil value.
+	return c.keyValue()
+}
+
+// first moves the cursor to the first leaf element under the last page in the stack.
+func (c *Cursor) first() {
+	for {
+		// Exit when we hit a leaf page.
+		var ref = &c.stack[len(c.stack)-1]
+		if ref.isLeaf() {
+			break
+		}
+
+		// Keep adding pages pointing to the first element to the stack.
+		var pgid pgid
+		if ref.node != nil {
+			pgid = ref.node.inodes[ref.index].pgid
+		} else {
+			pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
+		}
+		p, n := c.bucket.pageNode(pgid)
+		c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
+	}
+}
+
+// last moves the cursor to the last leaf element under the last page in the stack.
+func (c *Cursor) last() {
+	for {
+		// Exit when we hit a leaf page.
+		ref := &c.stack[len(c.stack)-1]
+		if ref.isLeaf() {
+			break
+		}
+
+		// Keep adding pages pointing to the last element in the stack.
+		var pgid pgid
+		if ref.node != nil {
+			pgid = ref.node.inodes[ref.index].pgid
+		} else {
+			pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
+		}
+		p, n := c.bucket.pageNode(pgid)
+
+		var nextRef = elemRef{page: p, node: n}
+		nextRef.index = nextRef.count() - 1
+		c.stack = append(c.stack, nextRef)
+	}
+}
+
+// next moves to the next leaf element and returns the key and value.
+// If the cursor is at the last leaf element then it stays there and returns nil.
+func (c *Cursor) next() (key []byte, value []byte, flags uint32) {
+	// Attempt to move over one element until we're successful.
+	// Move up the stack as we hit the end of each page in our stack.
+	var i int
+	for i = len(c.stack) - 1; i >= 0; i-- {
+		elem := &c.stack[i]
+		if elem.index < elem.count()-1 {
+			elem.index++
+			break
+		}
+	}
+
+	// If we've hit the root page then stop and return. This will leave the
+	// cursor on the last element of the last page.
+	if i == -1 {
+		return nil, nil, 0
+	}
+
+	// Otherwise start from where we left off in the stack and find the
+	// first element of the first leaf page.
+	c.stack = c.stack[:i+1]
+	c.first()
+	return c.keyValue()
+}
+
+// search recursively performs a binary search against a given page/node until it finds a given key.
+func (c *Cursor) search(key []byte, pgid pgid) {
+	p, n := c.bucket.pageNode(pgid)
+	if p != nil && (p.flags&(branchPageFlag|leafPageFlag)) == 0 {
+		panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags))
+	}
+	e := elemRef{page: p, node: n}
+	c.stack = append(c.stack, e)
+
+	// If we're on a leaf page/node then find the specific node.
+	if e.isLeaf() {
+		c.nsearch(key)
+		return
+	}
+
+	if n != nil {
+		c.searchNode(key, n)
+		return
+	}
+	c.searchPage(key, p)
+}
+
+func (c *Cursor) searchNode(key []byte, n *node) {
+	var exact bool
+	index := sort.Search(len(n.inodes), func(i int) bool {
+		// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
+		// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
+		ret := bytes.Compare(n.inodes[i].key, key)
+		if ret == 0 {
+			exact = true
+		}
+		return ret != -1
+	})
+	if !exact && index > 0 {
+		index--
+	}
+	c.stack[len(c.stack)-1].index = index
+
+	// Recursively search to the next page.
+	c.search(key, n.inodes[index].pgid)
+}
+
+func (c *Cursor) searchPage(key []byte, p *page) {
+	// Binary search for the correct range.
+	inodes := p.branchPageElements()
+
+	var exact bool
+	index := sort.Search(int(p.count), func(i int) bool {
+		// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
+		// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
+		ret := bytes.Compare(inodes[i].key(), key)
+		if ret == 0 {
+			exact = true
+		}
+		return ret != -1
+	})
+	if !exact && index > 0 {
+		index--
+	}
+	c.stack[len(c.stack)-1].index = index
+
+	// Recursively search to the next page.
+	c.search(key, inodes[index].pgid)
+}
+
+// nsearch searches the leaf node on the top of the stack for a key.
+func (c *Cursor) nsearch(key []byte) {
+	e := &c.stack[len(c.stack)-1]
+	p, n := e.page, e.node
+
+	// If we have a node then search its inodes.
+	if n != nil {
+		index := sort.Search(len(n.inodes), func(i int) bool {
+			return bytes.Compare(n.inodes[i].key, key) != -1
+		})
+		e.index = index
+		return
+	}
+
+	// If we have a page then search its leaf elements.
+	inodes := p.leafPageElements()
+	index := sort.Search(int(p.count), func(i int) bool {
+		return bytes.Compare(inodes[i].key(), key) != -1
+	})
+	e.index = index
+}
+
+// keyValue returns the key and value of the current leaf element.
+func (c *Cursor) keyValue() ([]byte, []byte, uint32) {
+	ref := &c.stack[len(c.stack)-1]
+	if ref.count() == 0 || ref.index >= ref.count() {
+		return nil, nil, 0
+	}
+
+	// Retrieve value from node.
+	if ref.node != nil {
+		inode := &ref.node.inodes[ref.index]
+		return inode.key, inode.value, inode.flags
+	}
+
+	// Or retrieve value from page.
+	elem := ref.page.leafPageElement(uint16(ref.index))
+	return elem.key(), elem.value(), elem.flags
+}
+
+// node returns the node that the cursor is currently positioned on.
+func (c *Cursor) node() *node {
+	_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")
+
+	// If the top of the stack is a leaf node then just return it.
+	if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() {
+		return ref.node
+	}
+
+	// Start from root and traverse down the hierarchy.
+	var n = c.stack[0].node
+	if n == nil {
+		n = c.bucket.node(c.stack[0].page.id, nil)
+	}
+	for _, ref := range c.stack[:len(c.stack)-1] {
+		_assert(!n.isLeaf, "expected branch node")
+		n = n.childAt(int(ref.index))
+	}
+	_assert(n.isLeaf, "expected leaf node")
+	return n
+}
+
+// elemRef represents a reference to an element on a given page/node.
+type elemRef struct {
+	page  *page
+	node  *node
+	index int
+}
+
+// isLeaf returns whether the ref is pointing at a leaf page/node.
+func (r *elemRef) isLeaf() bool {
+	if r.node != nil {
+		return r.node.isLeaf
+	}
+	return (r.page.flags & leafPageFlag) != 0
+}
+
+// count returns the number of inodes or page elements.
+func (r *elemRef) count() int {
+	if r.node != nil {
+		return len(r.node.inodes)
+	}
+	return int(r.page.count)
+}

+ 511 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/cursor_test.go

@@ -0,0 +1,511 @@
+package bolt_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"sort"
+	"testing"
+	"testing/quick"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+// Ensure that a cursor can return a reference to the bucket that created it.
+func TestCursor_Bucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		c := b.Cursor()
+		equals(t, b, c.Bucket())
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can seek to the appropriate keys.
+func TestCursor_Seek(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		ok(t, b.Put([]byte("foo"), []byte("0001")))
+		ok(t, b.Put([]byte("bar"), []byte("0002")))
+		ok(t, b.Put([]byte("baz"), []byte("0003")))
+		_, err = b.CreateBucket([]byte("bkt"))
+		ok(t, err)
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+
+		// Exact match should go to the key.
+		k, v := c.Seek([]byte("bar"))
+		equals(t, []byte("bar"), k)
+		equals(t, []byte("0002"), v)
+
+		// Inexact match should go to the next key.
+		k, v = c.Seek([]byte("bas"))
+		equals(t, []byte("baz"), k)
+		equals(t, []byte("0003"), v)
+
+		// Low key should go to the first key.
+		k, v = c.Seek([]byte(""))
+		equals(t, []byte("bar"), k)
+		equals(t, []byte("0002"), v)
+
+		// High key should return no key.
+		k, v = c.Seek([]byte("zzz"))
+		assert(t, k == nil, "")
+		assert(t, v == nil, "")
+
+		// Buckets should return their key but no value.
+		k, v = c.Seek([]byte("bkt"))
+		equals(t, []byte("bkt"), k)
+		assert(t, v == nil, "")
+
+		return nil
+	})
+}
+
+func TestCursor_Delete(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var count = 1000
+
+	// Insert keys 0 through count-1, plus a nested "sub" bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		for i := 0; i < count; i += 1 {
+			k := make([]byte, 8)
+			binary.BigEndian.PutUint64(k, uint64(i))
+			b.Put(k, make([]byte, 100))
+		}
+		b.CreateBucket([]byte("sub"))
+		return nil
+	})
+
+	db.Update(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		bound := make([]byte, 8)
+		binary.BigEndian.PutUint64(bound, uint64(count/2))
+		for key, _ := c.First(); bytes.Compare(key, bound) < 0; key, _ = c.Next() {
+			if err := c.Delete(); err != nil {
+				return err
+			}
+		}
+		c.Seek([]byte("sub"))
+		err := c.Delete()
+		equals(t, err, bolt.ErrIncompatibleValue)
+		return nil
+	})
+
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		equals(t, b.Stats().KeyN, count/2+1)
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can seek to the appropriate keys when there are a
+// large number of keys. This test also checks that seek will always move
+// forward to the next key.
+//
+// Related: https://github.com/boltdb/bolt/pull/187
+func TestCursor_Seek_Large(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var count = 10000
+
+	// Insert every other key between 0 and $count.
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		for i := 0; i < count; i += 100 {
+			for j := i; j < i+100; j += 2 {
+				k := make([]byte, 8)
+				binary.BigEndian.PutUint64(k, uint64(j))
+				b.Put(k, make([]byte, 100))
+			}
+		}
+		return nil
+	})
+
+	db.View(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		for i := 0; i < count; i++ {
+			seek := make([]byte, 8)
+			binary.BigEndian.PutUint64(seek, uint64(i))
+
+			k, _ := c.Seek(seek)
+
+			// The last seek is beyond the end of the range so
+			// it should return nil.
+			if i == count-1 {
+				assert(t, k == nil, "")
+				continue
+			}
+
+			// Otherwise we should seek to the exact key or the next key.
+			num := binary.BigEndian.Uint64(k)
+			if i%2 == 0 {
+				equals(t, uint64(i), num)
+			} else {
+				equals(t, uint64(i+1), num)
+			}
+		}
+
+		return nil
+	})
+}
+
+// Ensure that a cursor can iterate over an empty bucket without error.
+func TestCursor_EmptyBucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+	db.View(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		k, v := c.First()
+		assert(t, k == nil, "")
+		assert(t, v == nil, "")
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can reverse iterate over an empty bucket without error.
+func TestCursor_EmptyBucketReverse(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+	db.View(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		k, v := c.Last()
+		assert(t, k == nil, "")
+		assert(t, v == nil, "")
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can iterate over a single root with a couple elements.
+func TestCursor_Iterate_Leaf(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte{})
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte{0})
+		tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte{1})
+		return nil
+	})
+	tx, _ := db.Begin(false)
+	c := tx.Bucket([]byte("widgets")).Cursor()
+
+	k, v := c.First()
+	equals(t, string(k), "bar")
+	equals(t, v, []byte{1})
+
+	k, v = c.Next()
+	equals(t, string(k), "baz")
+	equals(t, v, []byte{})
+
+	k, v = c.Next()
+	equals(t, string(k), "foo")
+	equals(t, v, []byte{0})
+
+	k, v = c.Next()
+	assert(t, k == nil, "")
+	assert(t, v == nil, "")
+
+	k, v = c.Next()
+	assert(t, k == nil, "")
+	assert(t, v == nil, "")
+
+	tx.Rollback()
+}
+
+// Ensure that a Tx cursor can iterate in reverse over a single root with a couple elements.
+func TestCursor_LeafRootReverse(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte{})
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte{0})
+		tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte{1})
+		return nil
+	})
+	tx, _ := db.Begin(false)
+	c := tx.Bucket([]byte("widgets")).Cursor()
+
+	k, v := c.Last()
+	equals(t, string(k), "foo")
+	equals(t, v, []byte{0})
+
+	k, v = c.Prev()
+	equals(t, string(k), "baz")
+	equals(t, v, []byte{})
+
+	k, v = c.Prev()
+	equals(t, string(k), "bar")
+	equals(t, v, []byte{1})
+
+	k, v = c.Prev()
+	assert(t, k == nil, "")
+	assert(t, v == nil, "")
+
+	k, v = c.Prev()
+	assert(t, k == nil, "")
+	assert(t, v == nil, "")
+
+	tx.Rollback()
+}
+
+// Ensure that a Tx cursor can restart from the beginning.
+func TestCursor_Restart(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte{})
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte{})
+		return nil
+	})
+
+	tx, _ := db.Begin(false)
+	c := tx.Bucket([]byte("widgets")).Cursor()
+
+	k, _ := c.First()
+	equals(t, string(k), "bar")
+
+	k, _ = c.Next()
+	equals(t, string(k), "foo")
+
+	k, _ = c.First()
+	equals(t, string(k), "bar")
+
+	k, _ = c.Next()
+	equals(t, string(k), "foo")
+
+	tx.Rollback()
+}
+
+// Ensure that a Tx can iterate over all elements in a bucket.
+func TestCursor_QuickCheck(t *testing.T) {
+	f := func(items testdata) bool {
+		db := NewTestDB()
+		defer db.Close()
+
+		// Bulk insert all values.
+		tx, _ := db.Begin(true)
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		for _, item := range items {
+			ok(t, b.Put(item.Key, item.Value))
+		}
+		ok(t, tx.Commit())
+
+		// Sort test data.
+		sort.Sort(items)
+
+		// Iterate over all items and check consistency.
+		var index = 0
+		tx, _ = db.Begin(false)
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		for k, v := c.First(); k != nil && index < len(items); k, v = c.Next() {
+			equals(t, k, items[index].Key)
+			equals(t, v, items[index].Value)
+			index++
+		}
+		equals(t, len(items), index)
+		tx.Rollback()
+
+		return true
+	}
+	if err := quick.Check(f, qconfig()); err != nil {
+		t.Error(err)
+	}
+}
+
+// Ensure that a transaction can iterate over all elements in a bucket in reverse.
+func TestCursor_QuickCheck_Reverse(t *testing.T) {
+	f := func(items testdata) bool {
+		db := NewTestDB()
+		defer db.Close()
+
+		// Bulk insert all values.
+		tx, _ := db.Begin(true)
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		for _, item := range items {
+			ok(t, b.Put(item.Key, item.Value))
+		}
+		ok(t, tx.Commit())
+
+		// Sort test data.
+		sort.Sort(revtestdata(items))
+
+		// Iterate over all items and check consistency.
+		var index = 0
+		tx, _ = db.Begin(false)
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		for k, v := c.Last(); k != nil && index < len(items); k, v = c.Prev() {
+			equals(t, k, items[index].Key)
+			equals(t, v, items[index].Value)
+			index++
+		}
+		equals(t, len(items), index)
+		tx.Rollback()
+
+		return true
+	}
+	if err := quick.Check(f, qconfig()); err != nil {
+		t.Error(err)
+	}
+}
+
+// Ensure that a Tx cursor can iterate over subbuckets.
+func TestCursor_QuickCheck_BucketsOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		_, err = b.CreateBucket([]byte("foo"))
+		ok(t, err)
+		_, err = b.CreateBucket([]byte("bar"))
+		ok(t, err)
+		_, err = b.CreateBucket([]byte("baz"))
+		ok(t, err)
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		var names []string
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			names = append(names, string(k))
+			assert(t, v == nil, "")
+		}
+		equals(t, names, []string{"bar", "baz", "foo"})
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can reverse iterate over subbuckets.
+func TestCursor_QuickCheck_BucketsOnly_Reverse(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		ok(t, err)
+		_, err = b.CreateBucket([]byte("foo"))
+		ok(t, err)
+		_, err = b.CreateBucket([]byte("bar"))
+		ok(t, err)
+		_, err = b.CreateBucket([]byte("baz"))
+		ok(t, err)
+		return nil
+	})
+	db.View(func(tx *bolt.Tx) error {
+		var names []string
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		for k, v := c.Last(); k != nil; k, v = c.Prev() {
+			names = append(names, string(k))
+			assert(t, v == nil, "")
+		}
+		equals(t, names, []string{"foo", "baz", "bar"})
+		return nil
+	})
+}
+
+func ExampleCursor() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Start a read-write transaction.
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a new bucket.
+		tx.CreateBucket([]byte("animals"))
+
+		// Insert data into a bucket.
+		b := tx.Bucket([]byte("animals"))
+		b.Put([]byte("dog"), []byte("fun"))
+		b.Put([]byte("cat"), []byte("lame"))
+		b.Put([]byte("liger"), []byte("awesome"))
+
+		// Create a cursor for iteration.
+		c := b.Cursor()
+
+		// Iterate over items in sorted key order. This starts from the
+		// first key/value pair and updates the k/v variables to the
+		// next key/value on each iteration.
+		//
+		// The loop finishes at the end of the cursor when a nil key is returned.
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			fmt.Printf("A %s is %s.\n", k, v)
+		}
+
+		return nil
+	})
+
+	// Output:
+	// A cat is lame.
+	// A dog is fun.
+	// A liger is awesome.
+}
+
+func ExampleCursor_reverse() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Start a read-write transaction.
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a new bucket.
+		tx.CreateBucket([]byte("animals"))
+
+		// Insert data into a bucket.
+		b := tx.Bucket([]byte("animals"))
+		b.Put([]byte("dog"), []byte("fun"))
+		b.Put([]byte("cat"), []byte("lame"))
+		b.Put([]byte("liger"), []byte("awesome"))
+
+		// Create a cursor for iteration.
+		c := b.Cursor()
+
+		// Iterate over items in reverse sorted key order. This starts
+		// from the last key/value pair and updates the k/v variables to
+		// the previous key/value on each iteration.
+		//
+		// The loop finishes at the beginning of the cursor when a nil key
+		// is returned.
+		for k, v := c.Last(); k != nil; k, v = c.Prev() {
+			fmt.Printf("A %s is %s.\n", k, v)
+		}
+
+		return nil
+	})
+
+	// Output:
+	// A liger is awesome.
+	// A dog is fun.
+	// A cat is lame.
+}
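The forward and reverse iteration examples above only exercise First/Next and Last/Prev. A common companion pattern is a prefix scan built on the cursor's Seek method; Seek lives in the cursor.go added by this change but does not appear in the hunks shown here, so the sketch below assumes the upstream bolt signature Seek(prefix) (key, value). The file, bucket, and key names are illustrative placeholders.

package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/boltdb/bolt"
)

func main() {
	db, err := bolt.Open("scan.db", 0666, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Seed a few keys that share a common prefix.
	if err := db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("events"))
		if err != nil {
			return err
		}
		b.Put([]byte("2015-04-01|a"), []byte("x"))
		b.Put([]byte("2015-04-01|b"), []byte("y"))
		b.Put([]byte("2015-04-02|c"), []byte("z"))
		return nil
	}); err != nil {
		log.Fatal(err)
	}

	// Seek positions the cursor at the first key >= prefix; the loop stops as
	// soon as a key no longer carries the prefix.
	prefix := []byte("2015-04-01|")
	db.View(func(tx *bolt.Tx) error {
		c := tx.Bucket([]byte("events")).Cursor()
		for k, v := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, v = c.Next() {
			fmt.Printf("%s => %s\n", k, v)
		}
		return nil
	})
}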

+ 732 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/db.go

@@ -0,0 +1,732 @@
+package bolt
+
+import (
+	"fmt"
+	"hash/fnv"
+	"os"
+	"runtime"
+	"runtime/debug"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+)
+
+// The largest step that can be taken when remapping the mmap.
+const maxMmapStep = 1 << 30 // 1GB
+
+// The data file format version.
+const version = 2
+
+// Represents a marker value to indicate that a file is a Bolt DB.
+const magic uint32 = 0xED0CDAED
+
+// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
+// syncing changes to a file.  This is required as some operating systems,
+// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
+// must be synchronized using the msync(2) syscall.
+const IgnoreNoSync = runtime.GOOS == "openbsd"
+
+// Default values if not set in a DB instance.
+const (
+	DefaultMaxBatchSize  int = 1000
+	DefaultMaxBatchDelay     = 10 * time.Millisecond
+)
+
+// DB represents a collection of buckets persisted to a file on disk.
+// All data access is performed through transactions which can be obtained through the DB.
+// All the functions on DB will return ErrDatabaseNotOpen if accessed before Open() is called.
+type DB struct {
+	// When enabled, the database will perform a Check() after every commit.
+	// A panic is issued if the database is in an inconsistent state. This
+	// flag has a large performance impact so it should only be used for
+	// debugging purposes.
+	StrictMode bool
+
+	// Setting the NoSync flag will cause the database to skip fsync()
+	// calls after each commit. This can be useful when bulk loading data
+	// into a database and you can restart the bulk load in the event of
+	// a system failure or database corruption. Do not set this flag for
+	// normal use.
+	//
+	// If the package global IgnoreNoSync constant is true, this value is
+	// ignored.  See the comment on that constant for more details.
+	//
+	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
+	NoSync bool
+
+	// MaxBatchSize is the maximum size of a batch. Default value is
+	// copied from DefaultMaxBatchSize in Open.
+	//
+	// If <=0, disables batching.
+	//
+	// Do not change concurrently with calls to Batch.
+	MaxBatchSize int
+
+	// MaxBatchDelay is the maximum delay before a batch starts.
+	// Default value is copied from DefaultMaxBatchDelay in Open.
+	//
+	// If <=0, effectively disables batching.
+	//
+	// Do not change concurrently with calls to Batch.
+	MaxBatchDelay time.Duration
+
+	path     string
+	file     *os.File
+	dataref  []byte // mmap'ed readonly, write throws SEGV
+	data     *[maxMapSize]byte
+	datasz   int
+	meta0    *meta
+	meta1    *meta
+	pageSize int
+	opened   bool
+	rwtx     *Tx
+	txs      []*Tx
+	freelist *freelist
+	stats    Stats
+
+	batchMu sync.Mutex
+	batch   *batch
+
+	rwlock   sync.Mutex   // Allows only one writer at a time.
+	metalock sync.Mutex   // Protects meta page access.
+	mmaplock sync.RWMutex // Protects mmap access during remapping.
+	statlock sync.RWMutex // Protects stats access.
+
+	ops struct {
+		writeAt func(b []byte, off int64) (n int, err error)
+	}
+}
+
+// Path returns the path to the currently open database file.
+func (db *DB) Path() string {
+	return db.path
+}
+
+// GoString returns the Go string representation of the database.
+func (db *DB) GoString() string {
+	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
+}
+
+// String returns the string representation of the database.
+func (db *DB) String() string {
+	return fmt.Sprintf("DB<%q>", db.path)
+}
+
+// Open creates and opens a database at the given path.
+// If the file does not exist then it will be created automatically.
+// Passing in nil options will cause Bolt to open the database with the default options.
+func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
+	var db = &DB{opened: true}
+
+	// Set default options if no options are provided.
+	if options == nil {
+		options = DefaultOptions
+	}
+
+	// Set default values for later DB operations.
+	db.MaxBatchSize = DefaultMaxBatchSize
+	db.MaxBatchDelay = DefaultMaxBatchDelay
+
+	// Open data file and separate sync handler for metadata writes.
+	db.path = path
+
+	var err error
+	if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil {
+		_ = db.close()
+		return nil, err
+	}
+
+	// Lock file so that other processes using Bolt cannot use the database
+	// at the same time. This would cause corruption since the two processes
+	// would write meta pages and free pages separately.
+	if err := flock(db.file, options.Timeout); err != nil {
+		_ = db.close()
+		return nil, err
+	}
+
+	// Default values for test hooks
+	db.ops.writeAt = db.file.WriteAt
+
+	// Initialize the database if it doesn't exist.
+	if info, err := db.file.Stat(); err != nil {
+		return nil, fmt.Errorf("stat error: %s", err)
+	} else if info.Size() == 0 {
+		// Initialize new files with meta pages.
+		if err := db.init(); err != nil {
+			return nil, err
+		}
+	} else {
+		// Read the first meta page to determine the page size.
+		var buf [0x1000]byte
+		if _, err := db.file.ReadAt(buf[:], 0); err == nil {
+			m := db.pageInBuffer(buf[:], 0).meta()
+			if err := m.validate(); err != nil {
+				return nil, fmt.Errorf("meta0 error: %s", err)
+			}
+			db.pageSize = int(m.pageSize)
+		}
+	}
+
+	// Memory map the data file.
+	if err := db.mmap(0); err != nil {
+		_ = db.close()
+		return nil, err
+	}
+
+	// Read in the freelist.
+	db.freelist = newFreelist()
+	db.freelist.read(db.page(db.meta().freelist))
+
+	// Mark the database as opened and return.
+	return db, nil
+}
+
+// mmap opens the underlying memory-mapped file and initializes the meta references.
+// minsz is the minimum size that the new mmap can be.
+func (db *DB) mmap(minsz int) error {
+	db.mmaplock.Lock()
+	defer db.mmaplock.Unlock()
+
+	info, err := db.file.Stat()
+	if err != nil {
+		return fmt.Errorf("mmap stat error: %s", err)
+	} else if int(info.Size()) < db.pageSize*2 {
+		return fmt.Errorf("file size too small")
+	}
+
+	// Ensure the size is at least the minimum size.
+	var size = int(info.Size())
+	if size < minsz {
+		size = minsz
+	}
+	size, err = db.mmapSize(size)
+	if err != nil {
+		return err
+	}
+
+	// Dereference all mmap references before unmapping.
+	if db.rwtx != nil {
+		db.rwtx.root.dereference()
+	}
+
+	// Unmap existing data before continuing.
+	if err := db.munmap(); err != nil {
+		return err
+	}
+
+	// Memory-map the data file as a byte slice.
+	if err := mmap(db, size); err != nil {
+		return err
+	}
+
+	// Save references to the meta pages.
+	db.meta0 = db.page(0).meta()
+	db.meta1 = db.page(1).meta()
+
+	// Validate the meta pages.
+	if err := db.meta0.validate(); err != nil {
+		return fmt.Errorf("meta0 error: %s", err)
+	}
+	if err := db.meta1.validate(); err != nil {
+		return fmt.Errorf("meta1 error: %s", err)
+	}
+
+	return nil
+}
+
+// munmap unmaps the data file from memory.
+func (db *DB) munmap() error {
+	if err := munmap(db); err != nil {
+		return fmt.Errorf("unmap error: %s", err)
+	}
+	return nil
+}
+
+// mmapSize determines the appropriate size for the mmap given the current size
+// of the database. The size starts at 1MB and doubles until it reaches 1GB.
+// Returns an error if the new mmap size is greater than the max allowed.
+func (db *DB) mmapSize(size int) (int, error) {
+	// Double the size from 1MB until 1GB.
+	for i := uint(20); i <= 30; i++ {
+		if size <= 1<<i {
+			return 1 << i, nil
+		}
+	}
+
+	// Verify the requested size is not above the maximum allowed.
+	if size > maxMapSize {
+		return 0, fmt.Errorf("mmap too large")
+	}
+
+	// If larger than 1GB then grow by 1GB at a time.
+	sz := int64(size)
+	if remainder := sz % int64(maxMmapStep); remainder > 0 {
+		sz += int64(maxMmapStep) - remainder
+	}
+
+	// Ensure that the mmap size is a multiple of the page size.
+	// This should always be true since we're incrementing in MBs.
+	pageSize := int64(db.pageSize)
+	if (sz % pageSize) != 0 {
+		sz = ((sz / pageSize) + 1) * pageSize
+	}
+
+	// If we've exceeded the max size then only grow up to the max size.
+	if sz > maxMapSize {
+		sz = maxMapSize
+	}
+
+	return int(sz), nil
+}
+
+// init creates a new database file and initializes its meta pages.
+func (db *DB) init() error {
+	// Set the page size to the OS page size.
+	db.pageSize = os.Getpagesize()
+
+	// Create two meta pages on a buffer.
+	buf := make([]byte, db.pageSize*4)
+	for i := 0; i < 2; i++ {
+		p := db.pageInBuffer(buf[:], pgid(i))
+		p.id = pgid(i)
+		p.flags = metaPageFlag
+
+		// Initialize the meta page.
+		m := p.meta()
+		m.magic = magic
+		m.version = version
+		m.pageSize = uint32(db.pageSize)
+		m.freelist = 2
+		m.root = bucket{root: 3}
+		m.pgid = 4
+		m.txid = txid(i)
+	}
+
+	// Write an empty freelist at page 3.
+	p := db.pageInBuffer(buf[:], pgid(2))
+	p.id = pgid(2)
+	p.flags = freelistPageFlag
+	p.count = 0
+
+	// Write an empty leaf page at page 4.
+	p = db.pageInBuffer(buf[:], pgid(3))
+	p.id = pgid(3)
+	p.flags = leafPageFlag
+	p.count = 0
+
+	// Write the buffer to our data file.
+	if _, err := db.ops.writeAt(buf, 0); err != nil {
+		return err
+	}
+	if err := fdatasync(db); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Close releases all database resources.
+// All transactions must be closed before closing the database.
+func (db *DB) Close() error {
+	db.metalock.Lock()
+	defer db.metalock.Unlock()
+	return db.close()
+}
+
+func (db *DB) close() error {
+	db.opened = false
+
+	db.freelist = nil
+	db.path = ""
+
+	// Clear ops.
+	db.ops.writeAt = nil
+
+	// Close the mmap.
+	if err := db.munmap(); err != nil {
+		return err
+	}
+
+	// Close file handles.
+	if db.file != nil {
+		// Unlock the file.
+		_ = funlock(db.file)
+
+		// Close the file descriptor.
+		if err := db.file.Close(); err != nil {
+			return fmt.Errorf("db file close: %s", err)
+		}
+		db.file = nil
+	}
+
+	return nil
+}
+
+// Begin starts a new transaction.
+// Multiple read-only transactions can be used concurrently but only one
+// write transaction can be used at a time. Starting multiple write transactions
+// will cause the calls to block and be serialized until the current write
+// transaction finishes.
+//
+// IMPORTANT: You must close read-only transactions after you are finished or
+// else the database will not reclaim old pages.
+func (db *DB) Begin(writable bool) (*Tx, error) {
+	if writable {
+		return db.beginRWTx()
+	}
+	return db.beginTx()
+}
+
+func (db *DB) beginTx() (*Tx, error) {
+	// Lock the meta pages while we initialize the transaction. We obtain
+	// the meta lock before the mmap lock because that's the order that the
+	// write transaction will obtain them.
+	db.metalock.Lock()
+
+	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
+	// obtain a write lock so all transactions must finish before it can be
+	// remapped.
+	db.mmaplock.RLock()
+
+	// Exit if the database is not open yet.
+	if !db.opened {
+		db.mmaplock.RUnlock()
+		db.metalock.Unlock()
+		return nil, ErrDatabaseNotOpen
+	}
+
+	// Create a transaction associated with the database.
+	t := &Tx{}
+	t.init(db)
+
+	// Keep track of transaction until it closes.
+	db.txs = append(db.txs, t)
+	n := len(db.txs)
+
+	// Unlock the meta pages.
+	db.metalock.Unlock()
+
+	// Update the transaction stats.
+	db.statlock.Lock()
+	db.stats.TxN++
+	db.stats.OpenTxN = n
+	db.statlock.Unlock()
+
+	return t, nil
+}
+
+func (db *DB) beginRWTx() (*Tx, error) {
+	// Obtain writer lock. This is released by the transaction when it closes.
+	// This enforces only one writer transaction at a time.
+	db.rwlock.Lock()
+
+	// Once we have the writer lock then we can lock the meta pages so that
+	// we can set up the transaction.
+	db.metalock.Lock()
+	defer db.metalock.Unlock()
+
+	// Exit if the database is not open yet.
+	if !db.opened {
+		db.rwlock.Unlock()
+		return nil, ErrDatabaseNotOpen
+	}
+
+	// Create a transaction associated with the database.
+	t := &Tx{writable: true}
+	t.init(db)
+	db.rwtx = t
+
+	// Free any pages associated with closed read-only transactions.
+	var minid txid = 0xFFFFFFFFFFFFFFFF
+	for _, t := range db.txs {
+		if t.meta.txid < minid {
+			minid = t.meta.txid
+		}
+	}
+	if minid > 0 {
+		db.freelist.release(minid - 1)
+	}
+
+	return t, nil
+}
+
+// removeTx removes a transaction from the database.
+func (db *DB) removeTx(tx *Tx) {
+	// Release the read lock on the mmap.
+	db.mmaplock.RUnlock()
+
+	// Use the meta lock to restrict access to the DB object.
+	db.metalock.Lock()
+
+	// Remove the transaction.
+	for i, t := range db.txs {
+		if t == tx {
+			db.txs = append(db.txs[:i], db.txs[i+1:]...)
+			break
+		}
+	}
+	n := len(db.txs)
+
+	// Unlock the meta pages.
+	db.metalock.Unlock()
+
+	// Merge statistics.
+	db.statlock.Lock()
+	db.stats.OpenTxN = n
+	db.stats.TxStats.add(&tx.stats)
+	db.statlock.Unlock()
+}
+
+// Update executes a function within the context of a read-write managed transaction.
+// If no error is returned from the function then the transaction is committed.
+// If an error is returned then the entire transaction is rolled back.
+// Any error that is returned from the function or returned from the commit is
+// returned from the Update() method.
+//
+// Attempting to manually commit or rollback within the function will cause a panic.
+func (db *DB) Update(fn func(*Tx) error) error {
+	t, err := db.Begin(true)
+	if err != nil {
+		return err
+	}
+
+	// Make sure the transaction rolls back in the event of a panic.
+	defer func() {
+		if t.db != nil {
+			t.rollback()
+		}
+	}()
+
+	// Mark as a managed tx so that the inner function cannot manually commit.
+	t.managed = true
+
+	// If an error is returned from the function then rollback and return error.
+	err = fn(t)
+	t.managed = false
+	if err != nil {
+		_ = t.Rollback()
+		return err
+	}
+
+	return t.Commit()
+}
+
+// View executes a function within the context of a managed read-only transaction.
+// Any error that is returned from the function is returned from the View() method.
+//
+// Attempting to manually rollback within the function will cause a panic.
+func (db *DB) View(fn func(*Tx) error) error {
+	t, err := db.Begin(false)
+	if err != nil {
+		return err
+	}
+
+	// Make sure the transaction rolls back in the event of a panic.
+	defer func() {
+		if t.db != nil {
+			t.rollback()
+		}
+	}()
+
+	// Mark as a managed tx so that the inner function cannot manually rollback.
+	t.managed = true
+
+	// If an error is returned from the function then pass it through.
+	err = fn(t)
+	t.managed = false
+	if err != nil {
+		_ = t.Rollback()
+		return err
+	}
+
+	if err := t.Rollback(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Stats retrieves ongoing performance stats for the database.
+// This is only updated when a transaction closes.
+func (db *DB) Stats() Stats {
+	db.statlock.RLock()
+	defer db.statlock.RUnlock()
+	return db.stats
+}
+
+// Info provides internal access to the raw data bytes for the C cursor. Use
+// it carefully, or not at all.
+func (db *DB) Info() *Info {
+	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
+}
+
+// page retrieves a page reference from the mmap based on the current page size.
+func (db *DB) page(id pgid) *page {
+	pos := id * pgid(db.pageSize)
+	return (*page)(unsafe.Pointer(&db.data[pos]))
+}
+
+// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
+func (db *DB) pageInBuffer(b []byte, id pgid) *page {
+	return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
+}
+
+// meta retrieves the current meta page reference.
+func (db *DB) meta() *meta {
+	if db.meta0.txid > db.meta1.txid {
+		return db.meta0
+	}
+	return db.meta1
+}
+
+// allocate returns a contiguous block of memory starting at a given page.
+func (db *DB) allocate(count int) (*page, error) {
+	// Allocate a temporary buffer for the page.
+	buf := make([]byte, count*db.pageSize)
+	p := (*page)(unsafe.Pointer(&buf[0]))
+	p.overflow = uint32(count - 1)
+
+	// Use pages from the freelist if they are available.
+	if p.id = db.freelist.allocate(count); p.id != 0 {
+		return p, nil
+	}
+
+	// Resize mmap() if we're at the end.
+	p.id = db.rwtx.meta.pgid
+	var minsz = int((p.id+pgid(count))+1) * db.pageSize
+	if minsz >= db.datasz {
+		if err := db.mmap(minsz); err != nil {
+			return nil, fmt.Errorf("mmap allocate error: %s", err)
+		}
+	}
+
+	// Move the page id high water mark.
+	db.rwtx.meta.pgid += pgid(count)
+
+	return p, nil
+}
+
+// Options represents the options that can be set when opening a database.
+type Options struct {
+	// Timeout is the amount of time to wait to obtain a file lock.
+	// When set to zero it will wait indefinitely. This option is only
+	// available on Darwin and Linux.
+	Timeout time.Duration
+}
+
+// DefaultOptions represent the options used if nil options are passed into Open().
+// No timeout is used which will cause Bolt to wait indefinitely for a lock.
+var DefaultOptions = &Options{
+	Timeout: 0,
+}
+
+// Stats represents statistics about the database.
+type Stats struct {
+	// Freelist stats
+	FreePageN     int // total number of free pages on the freelist
+	PendingPageN  int // total number of pending pages on the freelist
+	FreeAlloc     int // total bytes allocated in free pages
+	FreelistInuse int // total bytes used by the freelist
+
+	// Transaction stats
+	TxN     int // total number of started read transactions
+	OpenTxN int // number of currently open read transactions
+
+	TxStats TxStats // global, ongoing stats.
+}
+
+// Sub calculates and returns the difference between two sets of database stats.
+// This is useful when obtaining stats at two different points in time and
+// you need the performance counters that occurred within that time span.
+func (s *Stats) Sub(other *Stats) Stats {
+	if other == nil {
+		return *s
+	}
+	var diff Stats
+	diff.FreePageN = s.FreePageN
+	diff.PendingPageN = s.PendingPageN
+	diff.FreeAlloc = s.FreeAlloc
+	diff.FreelistInuse = s.FreelistInuse
+	diff.TxN = s.TxN - other.TxN
+	diff.TxStats = s.TxStats.Sub(&other.TxStats)
+	return diff
+}
+
+func (s *Stats) add(other *Stats) {
+	s.TxStats.add(&other.TxStats)
+}
+
+type Info struct {
+	Data     uintptr
+	PageSize int
+}
+
+type meta struct {
+	magic    uint32
+	version  uint32
+	pageSize uint32
+	flags    uint32
+	root     bucket
+	freelist pgid
+	pgid     pgid
+	txid     txid
+	checksum uint64
+}
+
+// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
+func (m *meta) validate() error {
+	if m.checksum != 0 && m.checksum != m.sum64() {
+		return ErrChecksum
+	} else if m.magic != magic {
+		return ErrInvalid
+	} else if m.version != version {
+		return ErrVersionMismatch
+	}
+	return nil
+}
+
+// copy copies one meta object to another.
+func (m *meta) copy(dest *meta) {
+	*dest = *m
+}
+
+// write writes the meta onto a page.
+func (m *meta) write(p *page) {
+	if m.root.root >= m.pgid {
+		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
+	} else if m.freelist >= m.pgid {
+		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
+	}
+
+	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
+	p.id = pgid(m.txid % 2)
+	p.flags |= metaPageFlag
+
+	// Calculate the checksum.
+	m.checksum = m.sum64()
+
+	m.copy(p.meta())
+}
+
+// generates the checksum for the meta.
+func (m *meta) sum64() uint64 {
+	var h = fnv.New64a()
+	_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
+	return h.Sum64()
+}
+
+// _assert will panic with a given formatted message if the given condition is false.
+func _assert(condition bool, msg string, v ...interface{}) {
+	if !condition {
+		panic(fmt.Sprintf("assertion failed: "+msg, v...))
+	}
+}
+
+func warn(v ...interface{})              { fmt.Fprintln(os.Stderr, v...) }
+func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) }
+
+func printstack() {
+	stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n")
+	fmt.Fprintln(os.Stderr, stack)
+}
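The Stats/Sub pair above is meant for interval measurements: snapshot the counters, do some work, then subtract the earlier snapshot from the later one. A minimal sketch of that usage, relying only on the exported Open/View/Stats API defined in this file; the import path points at upstream bolt and the file name is a placeholder.

package main

import (
	"fmt"
	"log"

	"github.com/boltdb/bolt"
)

func main() {
	db, err := bolt.Open("stats.db", 0666, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Snapshot the counters before doing any work.
	prev := db.Stats()

	// Do some work: a few empty read-only transactions.
	for i := 0; i < 3; i++ {
		db.View(func(tx *bolt.Tx) error { return nil })
	}

	// Sub returns the counters accumulated since the snapshot; the freelist
	// gauges (FreePageN and friends) are copied from the receiver as-is.
	cur := db.Stats()
	delta := cur.Sub(&prev)
	fmt.Printf("read transactions in interval: %d\n", delta.TxN)
}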

+ 790 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go

@@ -0,0 +1,790 @@
+package bolt_test
+
+import (
+	"encoding/binary"
+	"errors"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"regexp"
+	"runtime"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+var statsFlag = flag.Bool("stats", false, "show performance stats")
+
+// Ensure that opening a database with a bad path returns an error.
+func TestOpen_BadPath(t *testing.T) {
+	db, err := bolt.Open("", 0666, nil)
+	assert(t, err != nil, "err: %s", err)
+	assert(t, db == nil, "")
+}
+
+// Ensure that a database can be opened without error.
+func TestOpen(t *testing.T) {
+	path := tempfile()
+	defer os.Remove(path)
+	db, err := bolt.Open(path, 0666, nil)
+	assert(t, db != nil, "")
+	ok(t, err)
+	equals(t, db.Path(), path)
+	ok(t, db.Close())
+}
+
+// Ensure that opening an already open database file will time out.
+func TestOpen_Timeout(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("timeout not supported on windows")
+	}
+
+	path := tempfile()
+	defer os.Remove(path)
+
+	// Open a data file.
+	db0, err := bolt.Open(path, 0666, nil)
+	assert(t, db0 != nil, "")
+	ok(t, err)
+
+	// Attempt to open the database again.
+	start := time.Now()
+	db1, err := bolt.Open(path, 0666, &bolt.Options{Timeout: 100 * time.Millisecond})
+	assert(t, db1 == nil, "")
+	equals(t, bolt.ErrTimeout, err)
+	assert(t, time.Since(start) > 100*time.Millisecond, "")
+
+	db0.Close()
+}
+
+// Ensure that opening an already open database file will wait until it is closed.
+func TestOpen_Wait(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("timeout not supported on windows")
+	}
+
+	path := tempfile()
+	defer os.Remove(path)
+
+	// Open a data file.
+	db0, err := bolt.Open(path, 0666, nil)
+	assert(t, db0 != nil, "")
+	ok(t, err)
+
+	// Close it in just a bit.
+	time.AfterFunc(100*time.Millisecond, func() { db0.Close() })
+
+	// Attempt to open the database again.
+	start := time.Now()
+	db1, err := bolt.Open(path, 0666, &bolt.Options{Timeout: 200 * time.Millisecond})
+	assert(t, db1 != nil, "")
+	ok(t, err)
+	assert(t, time.Since(start) > 100*time.Millisecond, "")
+}
+
+// Ensure that opening a database does not increase its size.
+// https://github.com/boltdb/bolt/issues/291
+func TestOpen_Size(t *testing.T) {
+	// Open a data file.
+	db := NewTestDB()
+	path := db.Path()
+	defer db.Close()
+
+	// Insert until we get above the minimum 4MB size.
+	ok(t, db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucketIfNotExists([]byte("data"))
+		for i := 0; i < 10000; i++ {
+			ok(t, b.Put([]byte(fmt.Sprintf("%04d", i)), make([]byte, 1000)))
+		}
+		return nil
+	}))
+
+	// Close database and grab the size.
+	db.DB.Close()
+	sz := fileSize(path)
+	if sz == 0 {
+		t.Fatalf("unexpected new file size: %d", sz)
+	}
+
+	// Reopen database, update, and check size again.
+	db0, err := bolt.Open(path, 0666, nil)
+	ok(t, err)
+	ok(t, db0.Update(func(tx *bolt.Tx) error { return tx.Bucket([]byte("data")).Put([]byte{0}, []byte{0}) }))
+	ok(t, db0.Close())
+	newSz := fileSize(path)
+	if newSz == 0 {
+		t.Fatalf("unexpected new file size: %d", newSz)
+	}
+
+	// Compare the original size with the new size.
+	if sz != newSz {
+		t.Fatalf("unexpected file growth: %d => %d", sz, newSz)
+	}
+}
+
+// Ensure that opening a database beyond the max step size does not increase its size.
+// https://github.com/boltdb/bolt/issues/303
+func TestOpen_Size_Large(t *testing.T) {
+	if testing.Short() {
+		t.Skip("short mode")
+	}
+
+	// Open a data file.
+	db := NewTestDB()
+	path := db.Path()
+	defer db.Close()
+
+	// Insert until we get above the minimum 4MB size.
+	var index uint64
+	for i := 0; i < 10000; i++ {
+		ok(t, db.Update(func(tx *bolt.Tx) error {
+			b, _ := tx.CreateBucketIfNotExists([]byte("data"))
+			for j := 0; j < 1000; j++ {
+				ok(t, b.Put(u64tob(index), make([]byte, 50)))
+				index++
+			}
+			return nil
+		}))
+	}
+
+	// Close database and grab the size.
+	db.DB.Close()
+	sz := fileSize(path)
+	if sz == 0 {
+		t.Fatalf("unexpected new file size: %d", sz)
+	} else if sz < (1 << 30) {
+		t.Fatalf("expected larger initial size: %d", sz)
+	}
+
+	// Reopen database, update, and check size again.
+	db0, err := bolt.Open(path, 0666, nil)
+	ok(t, err)
+	ok(t, db0.Update(func(tx *bolt.Tx) error { return tx.Bucket([]byte("data")).Put([]byte{0}, []byte{0}) }))
+	ok(t, db0.Close())
+	newSz := fileSize(path)
+	if newSz == 0 {
+		t.Fatalf("unexpected new file size: %d", newSz)
+	}
+
+	// Compare the original size with the new size.
+	if sz != newSz {
+		t.Fatalf("unexpected file growth: %d => %d", sz, newSz)
+	}
+}
+
+// Ensure that a re-opened database is consistent.
+func TestOpen_Check(t *testing.T) {
+	path := tempfile()
+	defer os.Remove(path)
+
+	db, err := bolt.Open(path, 0666, nil)
+	ok(t, err)
+	ok(t, db.View(func(tx *bolt.Tx) error { return <-tx.Check() }))
+	db.Close()
+
+	db, err = bolt.Open(path, 0666, nil)
+	ok(t, err)
+	ok(t, db.View(func(tx *bolt.Tx) error { return <-tx.Check() }))
+	db.Close()
+}
+
+// Ensure that the database returns an error if the file handle cannot be opened.
+func TestDB_Open_FileError(t *testing.T) {
+	path := tempfile()
+	defer os.Remove(path)
+
+	_, err := bolt.Open(path+"/youre-not-my-real-parent", 0666, nil)
+	assert(t, err.(*os.PathError) != nil, "")
+	equals(t, path+"/youre-not-my-real-parent", err.(*os.PathError).Path)
+	equals(t, "open", err.(*os.PathError).Op)
+}
+
+// Ensure that write errors to the meta file handler during initialization are returned.
+func TestDB_Open_MetaInitWriteError(t *testing.T) {
+	t.Skip("pending")
+}
+
+// Ensure that a database that is too small returns an error.
+func TestDB_Open_FileTooSmall(t *testing.T) {
+	path := tempfile()
+	defer os.Remove(path)
+
+	db, err := bolt.Open(path, 0666, nil)
+	ok(t, err)
+	db.Close()
+
+	// corrupt the database
+	ok(t, os.Truncate(path, int64(os.Getpagesize())))
+
+	db, err = bolt.Open(path, 0666, nil)
+	equals(t, errors.New("file size too small"), err)
+}
+
+// TODO(benbjohnson): Test corruption at every byte of the first two pages.
+
+// Ensure that a database cannot open a transaction when it's not open.
+func TestDB_Begin_DatabaseNotOpen(t *testing.T) {
+	var db bolt.DB
+	tx, err := db.Begin(false)
+	assert(t, tx == nil, "")
+	equals(t, err, bolt.ErrDatabaseNotOpen)
+}
+
+// Ensure that a read-write transaction can be retrieved.
+func TestDB_BeginRW(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, err := db.Begin(true)
+	assert(t, tx != nil, "")
+	ok(t, err)
+	assert(t, tx.DB() == db.DB, "")
+	equals(t, tx.Writable(), true)
+	ok(t, tx.Commit())
+}
+
+// Ensure that opening a transaction while the DB is closed returns an error.
+func TestDB_BeginRW_Closed(t *testing.T) {
+	var db bolt.DB
+	tx, err := db.Begin(true)
+	equals(t, err, bolt.ErrDatabaseNotOpen)
+	assert(t, tx == nil, "")
+}
+
+// Ensure a database can provide a transactional block.
+func TestDB_Update(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	err := db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		b.Put([]byte("foo"), []byte("bar"))
+		b.Put([]byte("baz"), []byte("bat"))
+		b.Delete([]byte("foo"))
+		return nil
+	})
+	ok(t, err)
+	err = db.View(func(tx *bolt.Tx) error {
+		assert(t, tx.Bucket([]byte("widgets")).Get([]byte("foo")) == nil, "")
+		equals(t, []byte("bat"), tx.Bucket([]byte("widgets")).Get([]byte("baz")))
+		return nil
+	})
+	ok(t, err)
+}
+
+// Ensure a closed database returns an error while running a transaction block
+func TestDB_Update_Closed(t *testing.T) {
+	var db bolt.DB
+	err := db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		return nil
+	})
+	equals(t, err, bolt.ErrDatabaseNotOpen)
+}
+
+// Ensure a panic occurs while trying to commit a managed transaction.
+func TestDB_Update_ManualCommit(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var ok bool
+	db.Update(func(tx *bolt.Tx) error {
+		func() {
+			defer func() {
+				if r := recover(); r != nil {
+					ok = true
+				}
+			}()
+			tx.Commit()
+		}()
+		return nil
+	})
+	assert(t, ok, "expected panic")
+}
+
+// Ensure a panic occurs while trying to rollback a managed transaction.
+func TestDB_Update_ManualRollback(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var ok bool
+	db.Update(func(tx *bolt.Tx) error {
+		func() {
+			defer func() {
+				if r := recover(); r != nil {
+					ok = true
+				}
+			}()
+			tx.Rollback()
+		}()
+		return nil
+	})
+	assert(t, ok, "expected panic")
+}
+
+// Ensure a panic occurs while trying to commit a managed transaction.
+func TestDB_View_ManualCommit(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var ok bool
+	db.Update(func(tx *bolt.Tx) error {
+		func() {
+			defer func() {
+				if r := recover(); r != nil {
+					ok = true
+				}
+			}()
+			tx.Commit()
+		}()
+		return nil
+	})
+	assert(t, ok, "expected panic")
+}
+
+// Ensure a panic occurs while trying to rollback a managed transaction.
+func TestDB_View_ManualRollback(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var ok bool
+	db.Update(func(tx *bolt.Tx) error {
+		func() {
+			defer func() {
+				if r := recover(); r != nil {
+					ok = true
+				}
+			}()
+			tx.Rollback()
+		}()
+		return nil
+	})
+	assert(t, ok, "expected panic")
+}
+
+// Ensure a write transaction that panics does not hold open locks.
+func TestDB_Update_Panic(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	func() {
+		defer func() {
+			if r := recover(); r != nil {
+				t.Log("recover: update", r)
+			}
+		}()
+		db.Update(func(tx *bolt.Tx) error {
+			tx.CreateBucket([]byte("widgets"))
+			panic("omg")
+		})
+	}()
+
+	// Verify we can update again.
+	err := db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+	ok(t, err)
+
+	// Verify that our change persisted.
+	err = db.Update(func(tx *bolt.Tx) error {
+		assert(t, tx.Bucket([]byte("widgets")) != nil, "")
+		return nil
+	})
+	ok(t, err)
+}
+
+// Ensure a database can return an error through a read-only transactional block.
+func TestDB_View_Error(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	err := db.View(func(tx *bolt.Tx) error {
+		return errors.New("xxx")
+	})
+	equals(t, errors.New("xxx"), err)
+}
+
+// Ensure a read transaction that panics does not hold open locks.
+func TestDB_View_Panic(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		return nil
+	})
+
+	func() {
+		defer func() {
+			if r := recover(); r != nil {
+				t.Log("recover: view", r)
+			}
+		}()
+		db.View(func(tx *bolt.Tx) error {
+			assert(t, tx.Bucket([]byte("widgets")) != nil, "")
+			panic("omg")
+		})
+	}()
+
+	// Verify that we can still use read transactions.
+	db.View(func(tx *bolt.Tx) error {
+		assert(t, tx.Bucket([]byte("widgets")) != nil, "")
+		return nil
+	})
+}
+
+// Ensure that an error is returned when a database write fails.
+func TestDB_Commit_WriteFail(t *testing.T) {
+	t.Skip("pending") // TODO(benbjohnson)
+}
+
+// Ensure that DB stats can be returned.
+func TestDB_Stats(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+	stats := db.Stats()
+	equals(t, 2, stats.TxStats.PageCount)
+	equals(t, 0, stats.FreePageN)
+	equals(t, 2, stats.PendingPageN)
+}
+
+// Ensure that database pages are in expected order and type.
+func TestDB_Consistency(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+
+	for i := 0; i < 10; i++ {
+		db.Update(func(tx *bolt.Tx) error {
+			ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")))
+			return nil
+		})
+	}
+	db.Update(func(tx *bolt.Tx) error {
+		p, _ := tx.Page(0)
+		assert(t, p != nil, "")
+		equals(t, "meta", p.Type)
+
+		p, _ = tx.Page(1)
+		assert(t, p != nil, "")
+		equals(t, "meta", p.Type)
+
+		p, _ = tx.Page(2)
+		assert(t, p != nil, "")
+		equals(t, "free", p.Type)
+
+		p, _ = tx.Page(3)
+		assert(t, p != nil, "")
+		equals(t, "free", p.Type)
+
+		p, _ = tx.Page(4)
+		assert(t, p != nil, "")
+		equals(t, "leaf", p.Type)
+
+		p, _ = tx.Page(5)
+		assert(t, p != nil, "")
+		equals(t, "freelist", p.Type)
+
+		p, _ = tx.Page(6)
+		assert(t, p == nil, "")
+		return nil
+	})
+}
+
+// Ensure that DB stats can be subtracted from one another.
+func TestDBStats_Sub(t *testing.T) {
+	var a, b bolt.Stats
+	a.TxStats.PageCount = 3
+	a.FreePageN = 4
+	b.TxStats.PageCount = 10
+	b.FreePageN = 14
+	diff := b.Sub(&a)
+	equals(t, 7, diff.TxStats.PageCount)
+	// free page stats are copied from the receiver and not subtracted
+	equals(t, 14, diff.FreePageN)
+}
+
+func ExampleDB_Update() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Execute several commands within a write transaction.
+	err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			return err
+		}
+		if err := b.Put([]byte("foo"), []byte("bar")); err != nil {
+			return err
+		}
+		return nil
+	})
+
+	// If our transactional block didn't return an error then our data is saved.
+	if err == nil {
+		db.View(func(tx *bolt.Tx) error {
+			value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+			fmt.Printf("The value of 'foo' is: %s\n", value)
+			return nil
+		})
+	}
+
+	// Output:
+	// The value of 'foo' is: bar
+}
+
+func ExampleDB_View() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Insert data into a bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("people"))
+		b := tx.Bucket([]byte("people"))
+		b.Put([]byte("john"), []byte("doe"))
+		b.Put([]byte("susy"), []byte("que"))
+		return nil
+	})
+
+	// Access data from within a read-only transactional block.
+	db.View(func(tx *bolt.Tx) error {
+		v := tx.Bucket([]byte("people")).Get([]byte("john"))
+		fmt.Printf("John's last name is %s.\n", v)
+		return nil
+	})
+
+	// Output:
+	// John's last name is doe.
+}
+
+func ExampleDB_Begin_ReadOnly() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Create a bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+
+	// Create several keys in a transaction.
+	tx, _ := db.Begin(true)
+	b := tx.Bucket([]byte("widgets"))
+	b.Put([]byte("john"), []byte("blue"))
+	b.Put([]byte("abby"), []byte("red"))
+	b.Put([]byte("zephyr"), []byte("purple"))
+	tx.Commit()
+
+	// Iterate over the values in sorted key order.
+	tx, _ = db.Begin(false)
+	c := tx.Bucket([]byte("widgets")).Cursor()
+	for k, v := c.First(); k != nil; k, v = c.Next() {
+		fmt.Printf("%s likes %s\n", k, v)
+	}
+	tx.Rollback()
+
+	// Output:
+	// abby likes red
+	// john likes blue
+	// zephyr likes purple
+}
+
+// TestDB represents a wrapper around a Bolt DB to handle temporary file
+// creation and automatic cleanup on close.
+type TestDB struct {
+	*bolt.DB
+}
+
+// NewTestDB returns a new instance of TestDB.
+func NewTestDB() *TestDB {
+	db, err := bolt.Open(tempfile(), 0666, nil)
+	if err != nil {
+		panic("cannot open db: " + err.Error())
+	}
+	return &TestDB{db}
+}
+
+// MustView executes a read-only function. Panic on error.
+func (db *TestDB) MustView(fn func(tx *bolt.Tx) error) {
+	if err := db.DB.View(func(tx *bolt.Tx) error {
+		return fn(tx)
+	}); err != nil {
+		panic(err.Error())
+	}
+}
+
+// MustUpdate executes a read-write function. Panic on error.
+func (db *TestDB) MustUpdate(fn func(tx *bolt.Tx) error) {
+	if err := db.DB.Update(func(tx *bolt.Tx) error {
+		return fn(tx)
+	}); err != nil {
+		panic(err.Error())
+	}
+}
+
+// MustCreateBucket creates a new bucket. Panic on error.
+func (db *TestDB) MustCreateBucket(name []byte) {
+	if err := db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte(name))
+		return err
+	}); err != nil {
+		panic(err.Error())
+	}
+}
+
+// Close closes the database and deletes the underlying file.
+func (db *TestDB) Close() {
+	// Log statistics.
+	if *statsFlag {
+		db.PrintStats()
+	}
+
+	// Check database consistency after every test.
+	db.MustCheck()
+
+	// Close database and remove file.
+	defer os.Remove(db.Path())
+	db.DB.Close()
+}
+
+// PrintStats prints the database stats
+func (db *TestDB) PrintStats() {
+	var stats = db.Stats()
+	fmt.Printf("[db] %-20s %-20s %-20s\n",
+		fmt.Sprintf("pg(%d/%d)", stats.TxStats.PageCount, stats.TxStats.PageAlloc),
+		fmt.Sprintf("cur(%d)", stats.TxStats.CursorCount),
+		fmt.Sprintf("node(%d/%d)", stats.TxStats.NodeCount, stats.TxStats.NodeDeref),
+	)
+	fmt.Printf("     %-20s %-20s %-20s\n",
+		fmt.Sprintf("rebal(%d/%v)", stats.TxStats.Rebalance, truncDuration(stats.TxStats.RebalanceTime)),
+		fmt.Sprintf("spill(%d/%v)", stats.TxStats.Spill, truncDuration(stats.TxStats.SpillTime)),
+		fmt.Sprintf("w(%d/%v)", stats.TxStats.Write, truncDuration(stats.TxStats.WriteTime)),
+	)
+}
+
+// MustCheck runs a consistency check on the database and panics if any errors are found.
+func (db *TestDB) MustCheck() {
+	db.View(func(tx *bolt.Tx) error {
+		// Collect all the errors.
+		var errors []error
+		for err := range tx.Check() {
+			errors = append(errors, err)
+			if len(errors) > 10 {
+				break
+			}
+		}
+
+		// If errors occurred, copy the DB and print the errors.
+		if len(errors) > 0 {
+			var path = tempfile()
+			tx.CopyFile(path, 0600)
+
+			// Print errors.
+			fmt.Print("\n\n")
+			fmt.Printf("consistency check failed (%d errors)\n", len(errors))
+			for _, err := range errors {
+				fmt.Println(err)
+			}
+			fmt.Println("")
+			fmt.Println("db saved to:")
+			fmt.Println(path)
+			fmt.Print("\n\n")
+			os.Exit(-1)
+		}
+
+		return nil
+	})
+}
+
+// CopyTempFile copies a database to a temporary file.
+func (db *TestDB) CopyTempFile() {
+	path := tempfile()
+	db.View(func(tx *bolt.Tx) error { return tx.CopyFile(path, 0600) })
+	fmt.Println("db copied to: ", path)
+}
+
+// tempfile returns a temporary file path.
+func tempfile() string {
+	f, _ := ioutil.TempFile("", "bolt-")
+	f.Close()
+	os.Remove(f.Name())
+	return f.Name()
+}
+
+// mustContainKeys checks that a bucket contains a given set of keys.
+func mustContainKeys(b *bolt.Bucket, m map[string]string) {
+	found := make(map[string]string)
+	b.ForEach(func(k, _ []byte) error {
+		found[string(k)] = ""
+		return nil
+	})
+
+	// Check for keys found in bucket that shouldn't be there.
+	var keys []string
+	for k := range found {
+		if _, ok := m[k]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+
+	// Check for keys not found in bucket that should be there.
+	for k := range m {
+		if _, ok := found[k]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys not found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+}
+
+func trunc(b []byte, length int) []byte {
+	if length < len(b) {
+		return b[:length]
+	}
+	return b
+}
+
+func truncDuration(d time.Duration) string {
+	return regexp.MustCompile(`^(\d+)(\.\d+)`).ReplaceAllString(d.String(), "$1")
+}
+
+func fileSize(path string) int64 {
+	fi, err := os.Stat(path)
+	if err != nil {
+		return 0
+	}
+	return fi.Size()
+}
+
+func warn(v ...interface{})              { fmt.Fprintln(os.Stderr, v...) }
+func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) }
+
+// u64tob converts a uint64 into an 8-byte slice.
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
+
+// btou64 converts an 8-byte slice into an uint64.
+func btou64(b []byte) uint64 { return binary.BigEndian.Uint64(b) }

+ 44 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/doc.go

@@ -0,0 +1,44 @@
+/*
+Package bolt implements a low-level key/value store in pure Go. It supports
+fully serializable transactions, ACID semantics, and lock-free MVCC with
+multiple readers and a single writer. Bolt can be used for projects that
+want a simple data store without the need to add large dependencies such as
+Postgres or MySQL.
+
+Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is
+optimized for fast read access and does not require recovery in the event of a
+system crash. Transactions which have not finished committing will simply be
+rolled back in the event of a crash.
+
+The design of Bolt is based on Howard Chu's LMDB database project.
+
+Bolt currently works on Windows, Mac OS X, and Linux.
+
+
+Basics
+
+There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is
+a collection of buckets and is represented by a single file on disk. A bucket is
+a collection of unique keys that are associated with values.
+
+Transactions provide either read-only or read-write access to the database.
+Read-only transactions can retrieve key/value pairs and can use Cursors to
+iterate over the dataset sequentially. Read-write transactions can create and
+delete buckets and can insert and remove keys. Only one read-write transaction
+is allowed at a time.
+
+
+Caveats
+
+The database uses a read-only, memory-mapped data file to ensure that
+applications cannot corrupt the database. However, this means that keys and
+values returned from Bolt cannot be changed. Writing to a read-only byte slice
+will cause Go to panic.
+
+Keys and values retrieved from the database are only valid for the life of
+the transaction. When used outside the transaction, these byte slices can
+point to different data or can point to invalid memory which will cause a panic.
+
+
+*/
+package bolt
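The caveat about slice lifetimes is the easiest one to trip over. A minimal sketch of the safe pattern, using only the API described above: copy any key or value you need to keep before the transaction ends. The path, bucket, and key names are placeholders.

package main

import (
	"fmt"
	"log"

	"github.com/boltdb/bolt"
)

func main() {
	db, err := bolt.Open("copy.db", 0666, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
		if err != nil {
			return err
		}
		return b.Put([]byte("foo"), []byte("bar"))
	}); err != nil {
		log.Fatal(err)
	}

	// Get returns a slice pointing into the read-only mmap; it is only valid
	// while the transaction is open, so copy it before View returns.
	var value []byte
	if err := db.View(func(tx *bolt.Tx) error {
		v := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
		value = append([]byte(nil), v...)
		return nil
	}); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("copied value: %s\n", value)
}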

+ 66 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/errors.go

@@ -0,0 +1,66 @@
+package bolt
+
+import "errors"
+
+// These errors can be returned when opening or calling methods on a DB.
+var (
+	// ErrDatabaseNotOpen is returned when a DB instance is accessed before it
+	// is opened or after it is closed.
+	ErrDatabaseNotOpen = errors.New("database not open")
+
+	// ErrDatabaseOpen is returned when opening a database that is
+	// already open.
+	ErrDatabaseOpen = errors.New("database already open")
+
+	// ErrInvalid is returned when a data file is not a Bolt-formatted database.
+	ErrInvalid = errors.New("invalid database")
+
+	// ErrVersionMismatch is returned when the data file was created with a
+	// different version of Bolt.
+	ErrVersionMismatch = errors.New("version mismatch")
+
+	// ErrChecksum is returned when either meta page checksum does not match.
+	ErrChecksum = errors.New("checksum error")
+
+	// ErrTimeout is returned when a database cannot obtain an exclusive lock
+	// on the data file after the timeout passed to Open().
+	ErrTimeout = errors.New("timeout")
+)
+
+// These errors can occur when beginning or committing a Tx.
+var (
+	// ErrTxNotWritable is returned when performing a write operation on a
+	// read-only transaction.
+	ErrTxNotWritable = errors.New("tx not writable")
+
+	// ErrTxClosed is returned when committing or rolling back a transaction
+	// that has already been committed or rolled back.
+	ErrTxClosed = errors.New("tx closed")
+)
+
+// These errors can occur when putting or deleting a value or a bucket.
+var (
+	// ErrBucketNotFound is returned when trying to access a bucket that has
+	// not been created yet.
+	ErrBucketNotFound = errors.New("bucket not found")
+
+	// ErrBucketExists is returned when creating a bucket that already exists.
+	ErrBucketExists = errors.New("bucket already exists")
+
+	// ErrBucketNameRequired is returned when creating a bucket with a blank name.
+	ErrBucketNameRequired = errors.New("bucket name required")
+
+	// ErrKeyRequired is returned when inserting a zero-length key.
+	ErrKeyRequired = errors.New("key required")
+
+	// ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize.
+	ErrKeyTooLarge = errors.New("key too large")
+
+	// ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize.
+	ErrValueTooLarge = errors.New("value too large")
+
+	// ErrIncompatibleValue is returned when trying to create or delete a bucket
+	// on an existing non-bucket key or when trying to create or delete a
+	// non-bucket key on an existing bucket key.
+	ErrIncompatibleValue = errors.New("incompatible value")
+)
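A short sketch of how these sentinel errors are typically consumed, assuming the CreateBucket call shown elsewhere in this change; comparing against bolt.ErrBucketExists lets setup code treat an already-created bucket as success rather than a failure. The path and bucket name are placeholders.

package main

import (
	"fmt"
	"log"

	"github.com/boltdb/bolt"
)

func main() {
	db, err := bolt.Open("app.db", 0666, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := db.Update(func(tx *bolt.Tx) error {
		// On the first run this creates the bucket; on later runs it returns
		// ErrBucketExists, which is fine for idempotent setup.
		if _, err := tx.CreateBucket([]byte("widgets")); err != nil && err != bolt.ErrBucketExists {
			return err
		}
		return nil
	}); err != nil {
		log.Fatal(err)
	}
	fmt.Println("bucket ready")
}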

+ 241 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/freelist.go

@@ -0,0 +1,241 @@
+package bolt
+
+import (
+	"fmt"
+	"sort"
+	"unsafe"
+)
+
+// freelist represents a list of all pages that are available for allocation.
+// It also tracks pages that have been freed but are still in use by open transactions.
+type freelist struct {
+	ids     []pgid          // all free and available free page ids.
+	pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
+	cache   map[pgid]bool   // fast lookup of all free and pending page ids.
+}
+
+// newFreelist returns an empty, initialized freelist.
+func newFreelist() *freelist {
+	return &freelist{
+		pending: make(map[txid][]pgid),
+		cache:   make(map[pgid]bool),
+	}
+}
+
+// size returns the size of the page after serialization.
+func (f *freelist) size() int {
+	return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * f.count())
+}
+
+// count returns count of pages on the freelist
+func (f *freelist) count() int {
+	return f.free_count() + f.pending_count()
+}
+
+// free_count returns count of free pages
+func (f *freelist) free_count() int {
+	return len(f.ids)
+}
+
+// pending_count returns count of pending pages
+func (f *freelist) pending_count() int {
+	var count int
+	for _, list := range f.pending {
+		count += len(list)
+	}
+	return count
+}
+
+// all returns a list of all free ids and all pending ids in one sorted list.
+func (f *freelist) all() []pgid {
+	ids := make([]pgid, len(f.ids))
+	copy(ids, f.ids)
+
+	for _, list := range f.pending {
+		ids = append(ids, list...)
+	}
+
+	sort.Sort(pgids(ids))
+	return ids
+}
+
+// allocate returns the starting page id of a contiguous list of pages of a given size.
+// If a contiguous block cannot be found then 0 is returned.
+func (f *freelist) allocate(n int) pgid {
+	if len(f.ids) == 0 {
+		return 0
+	}
+
+	var initial, previd pgid
+	for i, id := range f.ids {
+		if id <= 1 {
+			panic(fmt.Sprintf("invalid page allocation: %d", id))
+		}
+
+		// Reset initial page if this is not contiguous.
+		if previd == 0 || id-previd != 1 {
+			initial = id
+		}
+
+		// If we found a contiguous block then remove it and return it.
+		if (id-initial)+1 == pgid(n) {
+			// If we're allocating off the beginning then take the fast path
+			// and just adjust the existing slice. This will use extra memory
+			// temporarily but the append() in free() will realloc the slice
+			// as necessary.
+			if (i + 1) == n {
+				f.ids = f.ids[i+1:]
+			} else {
+				copy(f.ids[i-n+1:], f.ids[i+1:])
+				f.ids = f.ids[:len(f.ids)-n]
+			}
+
+			// Remove from the free cache.
+			for i := pgid(0); i < pgid(n); i++ {
+				delete(f.cache, initial+i)
+			}
+
+			return initial
+		}
+
+		previd = id
+	}
+	return 0
+}
+
+// free releases a page and its overflow for a given transaction id.
+// If the page is already free then a panic will occur.
+func (f *freelist) free(txid txid, p *page) {
+	if p.id <= 1 {
+		panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id))
+	}
+
+	// Free page and all its overflow pages.
+	var ids = f.pending[txid]
+	for id := p.id; id <= p.id+pgid(p.overflow); id++ {
+		// Verify that page is not already free.
+		if f.cache[id] {
+			panic(fmt.Sprintf("page %d already freed", id))
+		}
+
+		// Add to the freelist and cache.
+		ids = append(ids, id)
+		f.cache[id] = true
+	}
+	f.pending[txid] = ids
+}
+
+// release moves all page ids for a transaction id (or older) to the freelist.
+func (f *freelist) release(txid txid) {
+	for tid, ids := range f.pending {
+		if tid <= txid {
+			// Move transaction's pending pages to the available freelist.
+			// Don't remove from the cache since the page is still free.
+			f.ids = append(f.ids, ids...)
+			delete(f.pending, tid)
+		}
+	}
+	sort.Sort(pgids(f.ids))
+}
+
+// rollback removes the pages from a given pending tx.
+func (f *freelist) rollback(txid txid) {
+	// Remove page ids from cache.
+	for _, id := range f.pending[txid] {
+		delete(f.cache, id)
+	}
+
+	// Remove pages from pending list.
+	delete(f.pending, txid)
+}
+
+// freed returns whether a given page is in the free list.
+func (f *freelist) freed(pgid pgid) bool {
+	return f.cache[pgid]
+}
+
+// read initializes the freelist from a freelist page.
+func (f *freelist) read(p *page) {
+	// If the page.count is at the max uint16 value (64k) then it's considered
+	// an overflow and the size of the freelist is stored as the first element.
+	idx, count := 0, int(p.count)
+	if count == 0xFFFF {
+		idx = 1
+		count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
+	}
+
+	// Copy the list of page ids from the freelist.
+	ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count]
+	f.ids = make([]pgid, len(ids))
+	copy(f.ids, ids)
+
+	// Make sure they're sorted.
+	sort.Sort(pgids(f.ids))
+
+	// Rebuild the page cache.
+	f.reindex()
+}
+
+// write writes the page ids onto a freelist page. All free and pending ids are
+// saved to disk since in the event of a program crash, all pending ids will
+// become free.
+func (f *freelist) write(p *page) error {
+	// Combine the old free pgids and pgids waiting on an open transaction.
+	ids := f.all()
+
+	// Update the header flag.
+	p.flags |= freelistPageFlag
+
+	// The page.count can only hold up to 64k elements so if we overflow that
+	// number then we handle it by putting the size in the first element.
+	if len(ids) < 0xFFFF {
+		p.count = uint16(len(ids))
+		copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:], ids)
+	} else {
+		p.count = 0xFFFF
+		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(len(ids))
+		copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:], ids)
+	}
+
+	return nil
+}
+
+// reload reads the freelist from a page and filters out pending items.
+func (f *freelist) reload(p *page) {
+	f.read(p)
+
+	// Build a cache of only pending pages.
+	pcache := make(map[pgid]bool)
+	for _, pendingIDs := range f.pending {
+		for _, pendingID := range pendingIDs {
+			pcache[pendingID] = true
+		}
+	}
+
+	// Check each page in the freelist and build a new available freelist
+	// with any pages not in the pending lists.
+	var a []pgid
+	for _, id := range f.ids {
+		if !pcache[id] {
+			a = append(a, id)
+		}
+	}
+	f.ids = a
+
+	// Once the available list is rebuilt then rebuild the free cache so that
+	// it includes the available and pending free pages.
+	f.reindex()
+}
+
+// reindex rebuilds the free cache based on available and pending free lists.
+func (f *freelist) reindex() {
+	f.cache = make(map[pgid]bool)
+	for _, id := range f.ids {
+		f.cache[id] = true
+	}
+	for _, pendingIDs := range f.pending {
+		for _, pendingID := range pendingIDs {
+			f.cache[pendingID] = true
+		}
+	}
+}

+ 129 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/freelist_test.go

@@ -0,0 +1,129 @@
+package bolt
+
+import (
+	"reflect"
+	"testing"
+	"unsafe"
+)
+
+// Ensure that a page is added to a transaction's freelist.
+func TestFreelist_free(t *testing.T) {
+	f := newFreelist()
+	f.free(100, &page{id: 12})
+	if !reflect.DeepEqual([]pgid{12}, f.pending[100]) {
+		t.Fatalf("exp=%v; got=%v", []pgid{12}, f.pending[100])
+	}
+}
+
+// Ensure that a page and its overflow is added to a transaction's freelist.
+func TestFreelist_free_overflow(t *testing.T) {
+	f := newFreelist()
+	f.free(100, &page{id: 12, overflow: 3})
+	if exp := []pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100]) {
+		t.Fatalf("exp=%v; got=%v", exp, f.pending[100])
+	}
+}
+
+// Ensure that a transaction's free pages can be released.
+func TestFreelist_release(t *testing.T) {
+	f := newFreelist()
+	f.free(100, &page{id: 12, overflow: 1})
+	f.free(100, &page{id: 9})
+	f.free(102, &page{id: 39})
+	f.release(100)
+	f.release(101)
+	if exp := []pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.ids) {
+		t.Fatalf("exp=%v; got=%v", exp, f.ids)
+	}
+
+	f.release(102)
+	if exp := []pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.ids) {
+		t.Fatalf("exp=%v; got=%v", exp, f.ids)
+	}
+}
+
+// Ensure that a freelist can find contiguous blocks of pages.
+func TestFreelist_allocate(t *testing.T) {
+	f := &freelist{ids: []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}}
+	if id := int(f.allocate(3)); id != 3 {
+		t.Fatalf("exp=3; got=%v", id)
+	}
+	if id := int(f.allocate(1)); id != 6 {
+		t.Fatalf("exp=6; got=%v", id)
+	}
+	if id := int(f.allocate(3)); id != 0 {
+		t.Fatalf("exp=0; got=%v", id)
+	}
+	if id := int(f.allocate(2)); id != 12 {
+		t.Fatalf("exp=12; got=%v", id)
+	}
+	if id := int(f.allocate(1)); id != 7 {
+		t.Fatalf("exp=7; got=%v", id)
+	}
+	if id := int(f.allocate(0)); id != 0 {
+		t.Fatalf("exp=0; got=%v", id)
+	}
+	if id := int(f.allocate(0)); id != 0 {
+		t.Fatalf("exp=0; got=%v", id)
+	}
+	if exp := []pgid{9, 18}; !reflect.DeepEqual(exp, f.ids) {
+		t.Fatalf("exp=%v; got=%v", exp, f.ids)
+	}
+
+	if id := int(f.allocate(1)); id != 9 {
+		t.Fatalf("exp=9; got=%v", id)
+	}
+	if id := int(f.allocate(1)); id != 18 {
+		t.Fatalf("exp=18; got=%v", id)
+	}
+	if id := int(f.allocate(1)); id != 0 {
+		t.Fatalf("exp=0; got=%v", id)
+	}
+	if exp := []pgid{}; !reflect.DeepEqual(exp, f.ids) {
+		t.Fatalf("exp=%v; got=%v", exp, f.ids)
+	}
+}
+
+// Ensure that a freelist can deserialize from a freelist page.
+func TestFreelist_read(t *testing.T) {
+	// Create a page.
+	var buf [4096]byte
+	page := (*page)(unsafe.Pointer(&buf[0]))
+	page.flags = freelistPageFlag
+	page.count = 2
+
+	// Insert 2 page ids.
+	ids := (*[3]pgid)(unsafe.Pointer(&page.ptr))
+	ids[0] = 23
+	ids[1] = 50
+
+	// Deserialize page into a freelist.
+	f := newFreelist()
+	f.read(page)
+
+	// Ensure that there are two page ids in the freelist.
+	if exp := []pgid{23, 50}; !reflect.DeepEqual(exp, f.ids) {
+		t.Fatalf("exp=%v; got=%v", exp, f.ids)
+	}
+}
+
+// Ensure that a freelist can serialize into a freelist page.
+func TestFreelist_write(t *testing.T) {
+	// Create a freelist and write it to a page.
+	var buf [4096]byte
+	f := &freelist{ids: []pgid{12, 39}, pending: make(map[txid][]pgid)}
+	f.pending[100] = []pgid{28, 11}
+	f.pending[101] = []pgid{3}
+	p := (*page)(unsafe.Pointer(&buf[0]))
+	f.write(p)
+
+	// Read the page back out.
+	f2 := newFreelist()
+	f2.read(p)
+
+	// Ensure that the freelist is correct.
+	// All pages should be present and sorted in ascending order.
+	if exp := []pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.ids) {
+		t.Fatalf("exp=%v; got=%v", exp, f2.ids)
+	}
+}
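
The behaviour TestFreelist_allocate exercises is a scan for a contiguous run of page ids. Below is a read-only sketch of that search, separate from bolt's actual allocate (which also removes the run it hands out); the function name is made up for illustration:

package main

import "fmt"

// findRun returns the first id that starts a run of n consecutive page ids
// in the sorted slice ids, or 0 if no such run exists.
func findRun(ids []uint64, n int) uint64 {
	if n == 0 {
		return 0
	}
	var start uint64
	length := 0
	for i, id := range ids {
		if i == 0 || id != ids[i-1]+1 {
			start, length = id, 1
		} else {
			length++
		}
		if length == n {
			return start
		}
	}
	return 0
}

func main() {
	ids := []uint64{3, 4, 5, 6, 7, 9, 12, 13, 18}
	fmt.Println(findRun(ids, 3)) // 3
	fmt.Println(findRun(ids, 6)) // 0: no run of six consecutive ids
}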

+ 627 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/node.go

@@ -0,0 +1,627 @@
+package bolt
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+	"unsafe"
+)
+
+// node represents an in-memory, deserialized page.
+type node struct {
+	bucket     *Bucket
+	isLeaf     bool
+	unbalanced bool
+	spilled    bool
+	key        []byte
+	pgid       pgid
+	parent     *node
+	children   nodes
+	inodes     inodes
+}
+
+// root returns the top-level node this node is attached to.
+func (n *node) root() *node {
+	if n.parent == nil {
+		return n
+	}
+	return n.parent.root()
+}
+
+// minKeys returns the minimum number of inodes this node should have.
+func (n *node) minKeys() int {
+	if n.isLeaf {
+		return 1
+	}
+	return 2
+}
+
+// size returns the size of the node after serialization.
+func (n *node) size() int {
+	sz, elsz := pageHeaderSize, n.pageElementSize()
+	for i := 0; i < len(n.inodes); i++ {
+		item := &n.inodes[i]
+		sz += elsz + len(item.key) + len(item.value)
+	}
+	return sz
+}
+
+// sizeLessThan returns true if the node is less than a given size.
+// This is an optimization to avoid calculating a large node when we only need
+// to know if it fits inside a certain page size.
+func (n *node) sizeLessThan(v int) bool {
+	sz, elsz := pageHeaderSize, n.pageElementSize()
+	for i := 0; i < len(n.inodes); i++ {
+		item := &n.inodes[i]
+		sz += elsz + len(item.key) + len(item.value)
+		if sz >= v {
+			return false
+		}
+	}
+	return true
+}
+
+// pageElementSize returns the size of each page element based on the type of node.
+func (n *node) pageElementSize() int {
+	if n.isLeaf {
+		return leafPageElementSize
+	}
+	return branchPageElementSize
+}
+
+// childAt returns the child node at a given index.
+func (n *node) childAt(index int) *node {
+	if n.isLeaf {
+		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
+	}
+	return n.bucket.node(n.inodes[index].pgid, n)
+}
+
+// childIndex returns the index of a given child node.
+func (n *node) childIndex(child *node) int {
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
+	return index
+}
+
+// numChildren returns the number of children.
+func (n *node) numChildren() int {
+	return len(n.inodes)
+}
+
+// nextSibling returns the next node with the same parent.
+func (n *node) nextSibling() *node {
+	if n.parent == nil {
+		return nil
+	}
+	index := n.parent.childIndex(n)
+	if index >= n.parent.numChildren()-1 {
+		return nil
+	}
+	return n.parent.childAt(index + 1)
+}
+
+// prevSibling returns the previous node with the same parent.
+func (n *node) prevSibling() *node {
+	if n.parent == nil {
+		return nil
+	}
+	index := n.parent.childIndex(n)
+	if index == 0 {
+		return nil
+	}
+	return n.parent.childAt(index - 1)
+}
+
+// put inserts a key/value.
+func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
+	if pgid >= n.bucket.tx.meta.pgid {
+		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
+	} else if len(oldKey) <= 0 {
+		panic("put: zero-length old key")
+	} else if len(newKey) <= 0 {
+		panic("put: zero-length new key")
+	}
+
+	// Find insertion index.
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
+
+	// Add capacity and shift nodes if we don't have an exact match and need to insert.
+	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
+	if !exact {
+		n.inodes = append(n.inodes, inode{})
+		copy(n.inodes[index+1:], n.inodes[index:])
+	}
+
+	inode := &n.inodes[index]
+	inode.flags = flags
+	inode.key = newKey
+	inode.value = value
+	inode.pgid = pgid
+	_assert(len(inode.key) > 0, "put: zero-length inode key")
+}
+
+// del removes a key from the node.
+func (n *node) del(key []byte) {
+	// Find index of key.
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
+
+	// Exit if the key isn't found.
+	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
+		return
+	}
+
+	// Delete inode from the node.
+	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
+
+	// Mark the node as needing rebalancing.
+	n.unbalanced = true
+}
+
+// read initializes the node from a page.
+func (n *node) read(p *page) {
+	n.pgid = p.id
+	n.isLeaf = ((p.flags & leafPageFlag) != 0)
+	n.inodes = make(inodes, int(p.count))
+
+	for i := 0; i < int(p.count); i++ {
+		inode := &n.inodes[i]
+		if n.isLeaf {
+			elem := p.leafPageElement(uint16(i))
+			inode.flags = elem.flags
+			inode.key = elem.key()
+			inode.value = elem.value()
+		} else {
+			elem := p.branchPageElement(uint16(i))
+			inode.pgid = elem.pgid
+			inode.key = elem.key()
+		}
+		_assert(len(inode.key) > 0, "read: zero-length inode key")
+	}
+
+	// Save first key so we can find the node in the parent when we spill.
+	if len(n.inodes) > 0 {
+		n.key = n.inodes[0].key
+		_assert(len(n.key) > 0, "read: zero-length node key")
+	} else {
+		n.key = nil
+	}
+}
+
+// write writes the items onto one or more pages.
+func (n *node) write(p *page) {
+	// Initialize page.
+	if n.isLeaf {
+		p.flags |= leafPageFlag
+	} else {
+		p.flags |= branchPageFlag
+	}
+
+	if len(n.inodes) >= 0xFFFF {
+		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
+	}
+	p.count = uint16(len(n.inodes))
+
+	// Loop over each item and write it to the page.
+	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
+	for i, item := range n.inodes {
+		_assert(len(item.key) > 0, "write: zero-length inode key")
+
+		// Write the page element.
+		if n.isLeaf {
+			elem := p.leafPageElement(uint16(i))
+			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
+			elem.flags = item.flags
+			elem.ksize = uint32(len(item.key))
+			elem.vsize = uint32(len(item.value))
+		} else {
+			elem := p.branchPageElement(uint16(i))
+			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
+			elem.ksize = uint32(len(item.key))
+			elem.pgid = item.pgid
+			_assert(elem.pgid != p.id, "write: circular dependency occurred")
+		}
+
+		// Write data for the element to the end of the page.
+		copy(b[0:], item.key)
+		b = b[len(item.key):]
+		copy(b[0:], item.value)
+		b = b[len(item.value):]
+	}
+
+	// DEBUG ONLY: n.dump()
+}
+
+// split breaks up a node into multiple smaller nodes, if appropriate.
+// This should only be called from the spill() function.
+func (n *node) split(pageSize int) []*node {
+	var nodes []*node
+
+	node := n
+	for {
+		// Split node into two.
+		a, b := node.splitTwo(pageSize)
+		nodes = append(nodes, a)
+
+		// If we can't split then exit the loop.
+		if b == nil {
+			break
+		}
+
+		// Set node to b so it gets split on the next iteration.
+		node = b
+	}
+
+	return nodes
+}
+
+// splitTwo breaks up a node into two smaller nodes, if appropriate.
+// This should only be called from the split() function.
+func (n *node) splitTwo(pageSize int) (*node, *node) {
+	// Ignore the split if the page doesn't have at least enough nodes for
+	// two pages or if the nodes can fit in a single page.
+	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
+		return n, nil
+	}
+
+	// Determine the threshold before starting a new node.
+	var fillPercent = n.bucket.FillPercent
+	if fillPercent < minFillPercent {
+		fillPercent = minFillPercent
+	} else if fillPercent > maxFillPercent {
+		fillPercent = maxFillPercent
+	}
+	threshold := int(float64(pageSize) * fillPercent)
+
+	// Determine split position and sizes of the two pages.
+	splitIndex, _ := n.splitIndex(threshold)
+
+	// Split node into two separate nodes.
+	// If there's no parent then we'll need to create one.
+	if n.parent == nil {
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
+	}
+
+	// Create a new node and add it to the parent.
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+	n.parent.children = append(n.parent.children, next)
+
+	// Split inodes across two nodes.
+	next.inodes = n.inodes[splitIndex:]
+	n.inodes = n.inodes[:splitIndex]
+
+	// Update the statistics.
+	n.bucket.tx.stats.Split++
+
+	return n, next
+}
+
+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This should only be called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+	sz = pageHeaderSize
+
+	// Loop until we only have the minimum number of keys required for the second page.
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+		index = i
+		inode := n.inodes[i]
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+		// If we have at least the minimum number of keys and adding another
+		// node would put us over the threshold then exit and return.
+		if i >= minKeysPerPage && sz+elsize > threshold {
+			break
+		}
+
+		// Add the element size to the total size.
+		sz += elsize
+	}
+
+	return
+}
+
+// spill writes the nodes to dirty pages and splits nodes as it goes.
+// Returns an error if dirty pages cannot be allocated.
+func (n *node) spill() error {
+	var tx = n.bucket.tx
+	if n.spilled {
+		return nil
+	}
+
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
+	// the case of split-merge so we cannot use a range loop. We have to check
+	// the children size on every loop iteration.
+	sort.Sort(n.children)
+	for i := 0; i < len(n.children); i++ {
+		if err := n.children[i].spill(); err != nil {
+			return err
+		}
+	}
+
+	// We no longer need the child list because it's only used for spill tracking.
+	n.children = nil
+
+	// Split nodes into appropriate sizes. The first node will always be n.
+	var nodes = n.split(tx.db.pageSize)
+	for _, node := range nodes {
+		// Add node's page to the freelist if it's not new.
+		if node.pgid > 0 {
+			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
+			node.pgid = 0
+		}
+
+		// Allocate contiguous space for the node.
+		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
+		if err != nil {
+			return err
+		}
+
+		// Write the node.
+		if p.id >= tx.meta.pgid {
+			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
+		}
+		node.pgid = p.id
+		node.write(p)
+		node.spilled = true
+
+		// Insert into parent inodes.
+		if node.parent != nil {
+			var key = node.key
+			if key == nil {
+				key = node.inodes[0].key
+			}
+
+			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
+			node.key = node.inodes[0].key
+			_assert(len(node.key) > 0, "spill: zero-length node key")
+		}
+
+		// Update the statistics.
+		tx.stats.Spill++
+	}
+
+	// If the root node split and created a new root then we need to spill that
+	// as well. We'll clear out the children to make sure it doesn't try to respill.
+	if n.parent != nil && n.parent.pgid == 0 {
+		n.children = nil
+		return n.parent.spill()
+	}
+
+	return nil
+}
+
+// rebalance attempts to combine the node with sibling nodes if the node fill
+// size is below a threshold or if there are not enough keys.
+func (n *node) rebalance() {
+	if !n.unbalanced {
+		return
+	}
+	n.unbalanced = false
+
+	// Update statistics.
+	n.bucket.tx.stats.Rebalance++
+
+	// Ignore if node is above threshold (25%) and has enough keys.
+	var threshold = n.bucket.tx.db.pageSize / 4
+	if n.size() > threshold && len(n.inodes) > n.minKeys() {
+		return
+	}
+
+	// Root node has special handling.
+	if n.parent == nil {
+		// If root node is a branch and only has one node then collapse it.
+		if !n.isLeaf && len(n.inodes) == 1 {
+			// Move root's child up.
+			child := n.bucket.node(n.inodes[0].pgid, n)
+			n.isLeaf = child.isLeaf
+			n.inodes = child.inodes[:]
+			n.children = child.children
+
+			// Reparent all child nodes being moved.
+			for _, inode := range n.inodes {
+				if child, ok := n.bucket.nodes[inode.pgid]; ok {
+					child.parent = n
+				}
+			}
+
+			// Remove old child.
+			child.parent = nil
+			delete(n.bucket.nodes, child.pgid)
+			child.free()
+		}
+
+		return
+	}
+
+	// If node has no keys then just remove it.
+	if n.numChildren() == 0 {
+		n.parent.del(n.key)
+		n.parent.removeChild(n)
+		delete(n.bucket.nodes, n.pgid)
+		n.free()
+		n.parent.rebalance()
+		return
+	}
+
+	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
+
+	// Destination node is right sibling if idx == 0, otherwise left sibling.
+	var target *node
+	var useNextSibling = (n.parent.childIndex(n) == 0)
+	if useNextSibling {
+		target = n.nextSibling()
+	} else {
+		target = n.prevSibling()
+	}
+
+	// If target node has extra nodes then just move one over.
+	if target.numChildren() > target.minKeys() {
+		if useNextSibling {
+			// Reparent and move node.
+			if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok {
+				child.parent.removeChild(child)
+				child.parent = n
+				child.parent.children = append(child.parent.children, child)
+			}
+			n.inodes = append(n.inodes, target.inodes[0])
+			target.inodes = target.inodes[1:]
+
+			// Update target key on parent.
+			target.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0)
+			target.key = target.inodes[0].key
+			_assert(len(target.key) > 0, "rebalance(1): zero-length node key")
+		} else {
+			// Reparent and move node.
+			if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok {
+				child.parent.removeChild(child)
+				child.parent = n
+				child.parent.children = append(child.parent.children, child)
+			}
+			n.inodes = append(n.inodes, inode{})
+			copy(n.inodes[1:], n.inodes)
+			n.inodes[0] = target.inodes[len(target.inodes)-1]
+			target.inodes = target.inodes[:len(target.inodes)-1]
+		}
+
+		// Update parent key for node.
+		n.parent.put(n.key, n.inodes[0].key, nil, n.pgid, 0)
+		n.key = n.inodes[0].key
+		_assert(len(n.key) > 0, "rebalance(2): zero-length node key")
+
+		return
+	}
+
+	// If both this node and the target node are too small then merge them.
+	if useNextSibling {
+		// Reparent all child nodes being moved.
+		for _, inode := range target.inodes {
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
+				child.parent.removeChild(child)
+				child.parent = n
+				child.parent.children = append(child.parent.children, child)
+			}
+		}
+
+		// Copy over inodes from target and remove target.
+		n.inodes = append(n.inodes, target.inodes...)
+		n.parent.del(target.key)
+		n.parent.removeChild(target)
+		delete(n.bucket.nodes, target.pgid)
+		target.free()
+	} else {
+		// Reparent all child nodes being moved.
+		for _, inode := range n.inodes {
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
+				child.parent.removeChild(child)
+				child.parent = target
+				child.parent.children = append(child.parent.children, child)
+			}
+		}
+
+		// Copy over inodes to target and remove node.
+		target.inodes = append(target.inodes, n.inodes...)
+		n.parent.del(n.key)
+		n.parent.removeChild(n)
+		delete(n.bucket.nodes, n.pgid)
+		n.free()
+	}
+
+	// Either this node or the target node was deleted from the parent so rebalance it.
+	n.parent.rebalance()
+}
+
+// removeChild removes a node from the list of in-memory children.
+// This does not affect the inodes.
+func (n *node) removeChild(target *node) {
+	for i, child := range n.children {
+		if child == target {
+			n.children = append(n.children[:i], n.children[i+1:]...)
+			return
+		}
+	}
+}
+
+// dereference causes the node to copy all its inode key/value references to heap memory.
+// This is required when the mmap is reallocated so inodes are not pointing to stale data.
+func (n *node) dereference() {
+	if n.key != nil {
+		key := make([]byte, len(n.key))
+		copy(key, n.key)
+		n.key = key
+		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
+	}
+
+	for i := range n.inodes {
+		inode := &n.inodes[i]
+
+		key := make([]byte, len(inode.key))
+		copy(key, inode.key)
+		inode.key = key
+		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
+
+		value := make([]byte, len(inode.value))
+		copy(value, inode.value)
+		inode.value = value
+	}
+
+	// Recursively dereference children.
+	for _, child := range n.children {
+		child.dereference()
+	}
+
+	// Update statistics.
+	n.bucket.tx.stats.NodeDeref++
+}
+
+// free adds the node's underlying page to the freelist.
+func (n *node) free() {
+	if n.pgid != 0 {
+		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
+		n.pgid = 0
+	}
+}
+
+// dump writes the contents of the node to STDERR for debugging purposes.
+/*
+func (n *node) dump() {
+	// Write node header.
+	var typ = "branch"
+	if n.isLeaf {
+		typ = "leaf"
+	}
+	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
+
+	// Write out abbreviated version of each item.
+	for _, item := range n.inodes {
+		if n.isLeaf {
+			if item.flags&bucketLeafFlag != 0 {
+				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
+				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
+			} else {
+				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
+			}
+		} else {
+			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
+		}
+	}
+	warn("")
+}
+*/
+
+type nodes []*node
+
+func (s nodes) Len() int           { return len(s) }
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
+// inode represents an internal node inside of a node.
+// It can be used to point to elements in a page or point
+// to an element which hasn't been added to a page yet.
+type inode struct {
+	flags uint32
+	pgid  pgid
+	key   []byte
+	value []byte
+}
+
+type inodes []inode
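
put, del and childIndex above all lean on the same sort.Search idiom: binary-search for the first key that is not less than the target, then either stop on an exact match or shift the tail right and insert. A small self-contained sketch of that pattern over a sorted [][]byte (insertSorted is an illustrative name, not part of bolt):

package main

import (
	"bytes"
	"fmt"
	"sort"
)

// insertSorted adds key to a sorted key list using the same sort.Search
// idiom as node.put and node.del.
func insertSorted(keys [][]byte, key []byte) [][]byte {
	i := sort.Search(len(keys), func(j int) bool {
		return bytes.Compare(keys[j], key) != -1 // first key >= target
	})
	if i < len(keys) && bytes.Equal(keys[i], key) {
		return keys // exact match: node.put would overwrite the value here
	}
	keys = append(keys, nil)   // grow by one element
	copy(keys[i+1:], keys[i:]) // shift the tail right
	keys[i] = key
	return keys
}

func main() {
	keys := [][]byte{[]byte("bar"), []byte("foo")}
	keys = insertSorted(keys, []byte("baz"))
	for _, k := range keys {
		fmt.Println(string(k)) // bar, baz, foo
	}
}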

+ 156 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/node_test.go

@@ -0,0 +1,156 @@
+package bolt
+
+import (
+	"testing"
+	"unsafe"
+)
+
+// Ensure that a node can insert a key/value.
+func TestNode_put(t *testing.T) {
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{meta: &meta{pgid: 1}}}}
+	n.put([]byte("baz"), []byte("baz"), []byte("2"), 0, 0)
+	n.put([]byte("foo"), []byte("foo"), []byte("0"), 0, 0)
+	n.put([]byte("bar"), []byte("bar"), []byte("1"), 0, 0)
+	n.put([]byte("foo"), []byte("foo"), []byte("3"), 0, leafPageFlag)
+
+	if len(n.inodes) != 3 {
+		t.Fatalf("exp=3; got=%d", len(n.inodes))
+	}
+	if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "1" {
+		t.Fatalf("exp=<bar,1>; got=<%s,%s>", k, v)
+	}
+	if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "baz" || string(v) != "2" {
+		t.Fatalf("exp=<baz,2>; got=<%s,%s>", k, v)
+	}
+	if k, v := n.inodes[2].key, n.inodes[2].value; string(k) != "foo" || string(v) != "3" {
+		t.Fatalf("exp=<foo,3>; got=<%s,%s>", k, v)
+	}
+	if n.inodes[2].flags != uint32(leafPageFlag) {
+		t.Fatalf("not a leaf: %d", n.inodes[2].flags)
+	}
+}
+
+// Ensure that a node can deserialize from a leaf page.
+func TestNode_read_LeafPage(t *testing.T) {
+	// Create a page.
+	var buf [4096]byte
+	page := (*page)(unsafe.Pointer(&buf[0]))
+	page.flags = leafPageFlag
+	page.count = 2
+
+	// Insert 2 elements at the beginning. sizeof(leafPageElement) == 16
+	nodes := (*[3]leafPageElement)(unsafe.Pointer(&page.ptr))
+	nodes[0] = leafPageElement{flags: 0, pos: 32, ksize: 3, vsize: 4}  // pos = sizeof(leafPageElement) * 2
+	nodes[1] = leafPageElement{flags: 0, pos: 23, ksize: 10, vsize: 3} // pos = sizeof(leafPageElement) + 3 + 4
+
+	// Write data for the nodes at the end.
+	data := (*[4096]byte)(unsafe.Pointer(&nodes[2]))
+	copy(data[:], []byte("barfooz"))
+	copy(data[7:], []byte("helloworldbye"))
+
+	// Deserialize page into a leaf.
+	n := &node{}
+	n.read(page)
+
+	// Check that there are two inodes with correct data.
+	if !n.isLeaf {
+		t.Fatal("expected leaf")
+	}
+	if len(n.inodes) != 2 {
+		t.Fatalf("exp=2; got=%d", len(n.inodes))
+	}
+	if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "fooz" {
+		t.Fatalf("exp=<bar,fooz>; got=<%s,%s>", k, v)
+	}
+	if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "helloworld" || string(v) != "bye" {
+		t.Fatalf("exp=<helloworld,bye>; got=<%s,%s>", k, v)
+	}
+}
+
+// Ensure that a node can serialize into a leaf page.
+func TestNode_write_LeafPage(t *testing.T) {
+	// Create a node.
+	n := &node{isLeaf: true, inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("susy"), []byte("susy"), []byte("que"), 0, 0)
+	n.put([]byte("ricki"), []byte("ricki"), []byte("lake"), 0, 0)
+	n.put([]byte("john"), []byte("john"), []byte("johnson"), 0, 0)
+
+	// Write it to a page.
+	var buf [4096]byte
+	p := (*page)(unsafe.Pointer(&buf[0]))
+	n.write(p)
+
+	// Read the page back in.
+	n2 := &node{}
+	n2.read(p)
+
+	// Check that the two pages are the same.
+	if len(n2.inodes) != 3 {
+		t.Fatalf("exp=3; got=%d", len(n2.inodes))
+	}
+	if k, v := n2.inodes[0].key, n2.inodes[0].value; string(k) != "john" || string(v) != "johnson" {
+		t.Fatalf("exp=<john,johnson>; got=<%s,%s>", k, v)
+	}
+	if k, v := n2.inodes[1].key, n2.inodes[1].value; string(k) != "ricki" || string(v) != "lake" {
+		t.Fatalf("exp=<ricki,lake>; got=<%s,%s>", k, v)
+	}
+	if k, v := n2.inodes[2].key, n2.inodes[2].value; string(k) != "susy" || string(v) != "que" {
+		t.Fatalf("exp=<susy,que>; got=<%s,%s>", k, v)
+	}
+}
+
+// Ensure that a node can split into appropriate subgroups.
+func TestNode_split(t *testing.T) {
+	// Create a node.
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0)
+
+	// Split between 2 & 3.
+	n.split(100)
+
+	var parent = n.parent
+	if len(parent.children) != 2 {
+		t.Fatalf("exp=2; got=%d", len(parent.children))
+	}
+	if len(parent.children[0].inodes) != 2 {
+		t.Fatalf("exp=2; got=%d", len(parent.children[0].inodes))
+	}
+	if len(parent.children[1].inodes) != 3 {
+		t.Fatalf("exp=3; got=%d", len(parent.children[1].inodes))
+	}
+}
+
+// Ensure that a node with the minimum number of inodes just returns a single node.
+func TestNode_split_MinKeys(t *testing.T) {
+	// Create a node.
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
+
+	// Split.
+	n.split(20)
+	if n.parent != nil {
+		t.Fatalf("expected nil parent")
+	}
+}
+
+// Ensure that a node whose keys all fit on a single page just returns one node.
+func TestNode_split_SinglePage(t *testing.T) {
+	// Create a node.
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0)
+
+	// Split.
+	n.split(4096)
+	if n.parent != nil {
+		t.Fatalf("expected nil parent")
+	}
+}

+ 134 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/page.go

@@ -0,0 +1,134 @@
+package bolt
+
+import (
+	"fmt"
+	"os"
+	"unsafe"
+)
+
+const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
+
+const minKeysPerPage = 2
+
+const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
+const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))
+
+const (
+	branchPageFlag   = 0x01
+	leafPageFlag     = 0x02
+	metaPageFlag     = 0x04
+	freelistPageFlag = 0x10
+)
+
+const (
+	bucketLeafFlag = 0x01
+)
+
+type pgid uint64
+
+type page struct {
+	id       pgid
+	flags    uint16
+	count    uint16
+	overflow uint32
+	ptr      uintptr
+}
+
+// typ returns a human readable page type string used for debugging.
+func (p *page) typ() string {
+	if (p.flags & branchPageFlag) != 0 {
+		return "branch"
+	} else if (p.flags & leafPageFlag) != 0 {
+		return "leaf"
+	} else if (p.flags & metaPageFlag) != 0 {
+		return "meta"
+	} else if (p.flags & freelistPageFlag) != 0 {
+		return "freelist"
+	}
+	return fmt.Sprintf("unknown<%02x>", p.flags)
+}
+
+// meta returns a pointer to the metadata section of the page.
+func (p *page) meta() *meta {
+	return (*meta)(unsafe.Pointer(&p.ptr))
+}
+
+// leafPageElement retrieves the leaf node by index.
+func (p *page) leafPageElement(index uint16) *leafPageElement {
+	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
+	return n
+}
+
+// leafPageElements retrieves a list of leaf nodes.
+func (p *page) leafPageElements() []leafPageElement {
+	return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:]
+}
+
+// branchPageElement retrieves the branch node by index.
+func (p *page) branchPageElement(index uint16) *branchPageElement {
+	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
+}
+
+// branchPageElements retrieves a list of branch nodes.
+func (p *page) branchPageElements() []branchPageElement {
+	return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:]
+}
+
+// hexdump writes n bytes of the page to STDERR as hex output.
+func (p *page) hexdump(n int) {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
+	fmt.Fprintf(os.Stderr, "%x\n", buf)
+}
+
+type pages []*page
+
+func (s pages) Len() int           { return len(s) }
+func (s pages) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
+
+// branchPageElement represents a node on a branch page.
+type branchPageElement struct {
+	pos   uint32
+	ksize uint32
+	pgid  pgid
+}
+
+// key returns a byte slice of the node key.
+func (n *branchPageElement) key() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos : n.pos+n.ksize]
+}
+
+// leafPageElement represents a node on a leaf page.
+type leafPageElement struct {
+	flags uint32
+	pos   uint32
+	ksize uint32
+	vsize uint32
+}
+
+// key returns a byte slice of the node key.
+func (n *leafPageElement) key() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos : n.pos+n.ksize]
+}
+
+// value returns a byte slice of the node value.
+func (n *leafPageElement) value() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
+}
+
+// PageInfo represents human readable information about a page.
+type PageInfo struct {
+	ID            int
+	Type          string
+	Count         int
+	OverflowCount int
+}
+
+type pgids []pgid
+
+func (s pgids) Len() int           { return len(s) }
+func (s pgids) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s pgids) Less(i, j int) bool { return s[i] < s[j] }
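
The constants at the top of page.go are derived from struct layout via unsafe: the page header is everything before ptr (16 bytes with this field layout), and leafPageElement is four uint32 fields (also 16 bytes, the figure node_test.go relies on). A standalone sketch that prints those sizes from locally declared mirror structs, assuming the same field layout as above:

package main

import (
	"fmt"
	"unsafe"
)

// Local mirrors of the structs above, so the sizes can be printed
// outside the bolt package.
type pageHdr struct {
	id       uint64
	flags    uint16
	count    uint16
	overflow uint32
	ptr      uintptr
}

type leafElem struct {
	flags, pos, ksize, vsize uint32
}

func main() {
	fmt.Println(unsafe.Offsetof(((*pageHdr)(nil)).ptr)) // 16: the page header size
	fmt.Println(unsafe.Sizeof(leafElem{}))              // 16: the size node_test.go relies on
}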

+ 29 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/page_test.go

@@ -0,0 +1,29 @@
+package bolt
+
+import (
+	"testing"
+)
+
+// Ensure that the page type can be returned in human readable format.
+func TestPage_typ(t *testing.T) {
+	if typ := (&page{flags: branchPageFlag}).typ(); typ != "branch" {
+		t.Fatalf("exp=branch; got=%v", typ)
+	}
+	if typ := (&page{flags: leafPageFlag}).typ(); typ != "leaf" {
+		t.Fatalf("exp=leaf; got=%v", typ)
+	}
+	if typ := (&page{flags: metaPageFlag}).typ(); typ != "meta" {
+		t.Fatalf("exp=meta; got=%v", typ)
+	}
+	if typ := (&page{flags: freelistPageFlag}).typ(); typ != "freelist" {
+		t.Fatalf("exp=freelist; got=%v", typ)
+	}
+	if typ := (&page{flags: 20000}).typ(); typ != "unknown<4e20>" {
+		t.Fatalf("exp=unknown<4e20>; got=%v", typ)
+	}
+}
+
+// Ensure that the hexdump debugging function doesn't blow up.
+func TestPage_dump(t *testing.T) {
+	(&page{id: 256}).hexdump(16)
+}

+ 79 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/quick_test.go

@@ -0,0 +1,79 @@
+package bolt_test
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"math/rand"
+	"os"
+	"reflect"
+	"testing/quick"
+	"time"
+)
+
+// testing/quick defaults to 5 iterations and a random seed.
+// You can override these settings from the command line:
+//
+//   -quick.count     The number of iterations to perform.
+//   -quick.seed      The seed to use for randomizing.
+//   -quick.maxitems  The maximum number of items to insert into a DB.
+//   -quick.maxksize  The maximum size of a key.
+//   -quick.maxvsize  The maximum size of a value.
+//
+
+var qcount, qseed, qmaxitems, qmaxksize, qmaxvsize int
+
+func init() {
+	flag.IntVar(&qcount, "quick.count", 5, "")
+	flag.IntVar(&qseed, "quick.seed", int(time.Now().UnixNano())%100000, "")
+	flag.IntVar(&qmaxitems, "quick.maxitems", 1000, "")
+	flag.IntVar(&qmaxksize, "quick.maxksize", 1024, "")
+	flag.IntVar(&qmaxvsize, "quick.maxvsize", 1024, "")
+	flag.Parse()
+	fmt.Fprintln(os.Stderr, "seed:", qseed)
+	fmt.Fprintf(os.Stderr, "quick settings: count=%v, items=%v, ksize=%v, vsize=%v\n", qcount, qmaxitems, qmaxksize, qmaxvsize)
+}
+
+func qconfig() *quick.Config {
+	return &quick.Config{
+		MaxCount: qcount,
+		Rand:     rand.New(rand.NewSource(int64(qseed))),
+	}
+}
+
+type testdata []testdataitem
+
+func (t testdata) Len() int           { return len(t) }
+func (t testdata) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
+func (t testdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == -1 }
+
+func (t testdata) Generate(rand *rand.Rand, size int) reflect.Value {
+	n := rand.Intn(qmaxitems-1) + 1
+	items := make(testdata, n)
+	for i := 0; i < n; i++ {
+		item := &items[i]
+		item.Key = randByteSlice(rand, 1, qmaxksize)
+		item.Value = randByteSlice(rand, 0, qmaxvsize)
+	}
+	return reflect.ValueOf(items)
+}
+
+type revtestdata []testdataitem
+
+func (t revtestdata) Len() int           { return len(t) }
+func (t revtestdata) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
+func (t revtestdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == 1 }
+
+type testdataitem struct {
+	Key   []byte
+	Value []byte
+}
+
+func randByteSlice(rand *rand.Rand, minSize, maxSize int) []byte {
+	n := rand.Intn(maxSize-minSize) + minSize
+	b := make([]byte, n)
+	for i := 0; i < n; i++ {
+		b[i] = byte(rand.Intn(255))
+	}
+	return b
+}
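
qconfig() above returns a *quick.Config that property-based tests elsewhere in this package (not shown in this hunk) hand to testing/quick. In isolation, such a check looks roughly like the following; the reverse-twice property is a placeholder for illustration, not one of bolt's tests:

package main

import (
	"fmt"
	"math/rand"
	"testing/quick"
)

func main() {
	// A Config like the one qconfig() builds: fixed iteration count and seed.
	cfg := &quick.Config{
		MaxCount: 5,
		Rand:     rand.New(rand.NewSource(42)),
	}

	// Property: reversing a byte slice twice yields the original slice.
	prop := func(xs []byte) bool {
		rev := func(in []byte) []byte {
			out := make([]byte, len(in))
			for i, b := range in {
				out[len(in)-1-i] = b
			}
			return out
		}
		return string(rev(rev(xs))) == string(xs)
	}

	fmt.Println(quick.Check(prop, cfg)) // <nil>
}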

+ 327 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/simulation_test.go

@@ -0,0 +1,327 @@
+package bolt_test
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"sync"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+func TestSimulate_1op_1p(t *testing.T)     { testSimulate(t, 1, 1) }
+func TestSimulate_10op_1p(t *testing.T)    { testSimulate(t, 10, 1) }
+func TestSimulate_100op_1p(t *testing.T)   { testSimulate(t, 100, 1) }
+func TestSimulate_1000op_1p(t *testing.T)  { testSimulate(t, 1000, 1) }
+func TestSimulate_10000op_1p(t *testing.T) { testSimulate(t, 10000, 1) }
+
+func TestSimulate_10op_10p(t *testing.T)    { testSimulate(t, 10, 10) }
+func TestSimulate_100op_10p(t *testing.T)   { testSimulate(t, 100, 10) }
+func TestSimulate_1000op_10p(t *testing.T)  { testSimulate(t, 1000, 10) }
+func TestSimulate_10000op_10p(t *testing.T) { testSimulate(t, 10000, 10) }
+
+func TestSimulate_100op_100p(t *testing.T)   { testSimulate(t, 100, 100) }
+func TestSimulate_1000op_100p(t *testing.T)  { testSimulate(t, 1000, 100) }
+func TestSimulate_10000op_100p(t *testing.T) { testSimulate(t, 10000, 100) }
+
+func TestSimulate_10000op_1000p(t *testing.T) { testSimulate(t, 10000, 1000) }
+
+// Randomly generate operations on a given database with multiple clients to ensure consistency and thread safety.
+func testSimulate(t *testing.T, threadCount, parallelism int) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	rand.Seed(int64(qseed))
+
+	// A list of operations that readers and writers can perform.
+	var readerHandlers = []simulateHandler{simulateGetHandler}
+	var writerHandlers = []simulateHandler{simulateGetHandler, simulatePutHandler}
+
+	var versions = make(map[int]*QuickDB)
+	versions[1] = NewQuickDB()
+
+	db := NewTestDB()
+	defer db.Close()
+
+	var mutex sync.Mutex
+
+	// Run n threads in parallel, each with their own operation.
+	var wg sync.WaitGroup
+	var threads = make(chan bool, parallelism)
+	var i int
+	for {
+		threads <- true
+		wg.Add(1)
+		writable := ((rand.Int() % 100) < 20) // 20% writers
+
+		// Choose an operation to execute.
+		var handler simulateHandler
+		if writable {
+			handler = writerHandlers[rand.Intn(len(writerHandlers))]
+		} else {
+			handler = readerHandlers[rand.Intn(len(readerHandlers))]
+		}
+
+		// Execute a thread for the given operation.
+		go func(writable bool, handler simulateHandler) {
+			defer wg.Done()
+
+			// Start transaction.
+			tx, err := db.Begin(writable)
+			if err != nil {
+				t.Fatal("tx begin: ", err)
+			}
+
+			// Obtain current state of the dataset.
+			mutex.Lock()
+			var qdb = versions[tx.ID()]
+			if writable {
+				qdb = versions[tx.ID()-1].Copy()
+			}
+			mutex.Unlock()
+
+			// Make sure we commit/rollback the tx at the end and update the state.
+			if writable {
+				defer func() {
+					mutex.Lock()
+					versions[tx.ID()] = qdb
+					mutex.Unlock()
+
+					ok(t, tx.Commit())
+				}()
+			} else {
+				defer tx.Rollback()
+			}
+
+			// Ignore operation if we don't have data yet.
+			if qdb == nil {
+				return
+			}
+
+			// Execute handler.
+			handler(tx, qdb)
+
+			// Release a thread back to the scheduling loop.
+			<-threads
+		}(writable, handler)
+
+		i++
+		if i > threadCount {
+			break
+		}
+	}
+
+	// Wait until all threads are done.
+	wg.Wait()
+}
+
+type simulateHandler func(tx *bolt.Tx, qdb *QuickDB)
+
+// Retrieves a key from the database and verifies that it is what is expected.
+func simulateGetHandler(tx *bolt.Tx, qdb *QuickDB) {
+	// Randomly retrieve an existing key path.
+	keys := qdb.Rand()
+	if len(keys) == 0 {
+		return
+	}
+
+	// Retrieve root bucket.
+	b := tx.Bucket(keys[0])
+	if b == nil {
+		panic(fmt.Sprintf("bucket[0] expected: %08x\n", trunc(keys[0], 4)))
+	}
+
+	// Drill into nested buckets.
+	for _, key := range keys[1 : len(keys)-1] {
+		b = b.Bucket(key)
+		if b == nil {
+			panic(fmt.Sprintf("bucket[n] expected: %v -> %v\n", keys, key))
+		}
+	}
+
+	// Verify key/value on the final bucket.
+	expected := qdb.Get(keys)
+	actual := b.Get(keys[len(keys)-1])
+	if !bytes.Equal(actual, expected) {
+		fmt.Println("=== EXPECTED ===")
+		fmt.Println(expected)
+		fmt.Println("=== ACTUAL ===")
+		fmt.Println(actual)
+		fmt.Println("=== END ===")
+		panic("value mismatch")
+	}
+}
+
+// Inserts a key into the database.
+func simulatePutHandler(tx *bolt.Tx, qdb *QuickDB) {
+	var err error
+	keys, value := randKeys(), randValue()
+
+	// Retrieve root bucket.
+	b := tx.Bucket(keys[0])
+	if b == nil {
+		b, err = tx.CreateBucket(keys[0])
+		if err != nil {
+			panic("create bucket: " + err.Error())
+		}
+	}
+
+	// Create nested buckets, if necessary.
+	for _, key := range keys[1 : len(keys)-1] {
+		child := b.Bucket(key)
+		if child != nil {
+			b = child
+		} else {
+			b, err = b.CreateBucket(key)
+			if err != nil {
+				panic("create bucket: " + err.Error())
+			}
+		}
+	}
+
+	// Insert into database.
+	if err := b.Put(keys[len(keys)-1], value); err != nil {
+		panic("put: " + err.Error())
+	}
+
+	// Insert into in-memory database.
+	qdb.Put(keys, value)
+}
+
+// QuickDB is an in-memory database that mirrors the functionality of the
+// Bolt DB type. It is meant for testing that the Bolt database remains
+// consistent.
+type QuickDB struct {
+	sync.RWMutex
+	m map[string]interface{}
+}
+
+// NewQuickDB returns an instance of QuickDB.
+func NewQuickDB() *QuickDB {
+	return &QuickDB{m: make(map[string]interface{})}
+}
+
+// Get retrieves the value at a key path.
+func (db *QuickDB) Get(keys [][]byte) []byte {
+	db.RLock()
+	defer db.RUnlock()
+
+	m := db.m
+	for _, key := range keys[:len(keys)-1] {
+		value := m[string(key)]
+		if value == nil {
+			return nil
+		}
+		switch value := value.(type) {
+		case map[string]interface{}:
+			m = value
+		case []byte:
+			return nil
+		}
+	}
+
+	// Only return if it's a simple value.
+	if value, ok := m[string(keys[len(keys)-1])].([]byte); ok {
+		return value
+	}
+	return nil
+}
+
+// Put inserts a value into a key path.
+func (db *QuickDB) Put(keys [][]byte, value []byte) {
+	db.Lock()
+	defer db.Unlock()
+
+	// Build buckets all the way down the key path.
+	m := db.m
+	for _, key := range keys[:len(keys)-1] {
+		if _, ok := m[string(key)].([]byte); ok {
+			return // Keypath intersects with a simple value. Do nothing.
+		}
+
+		if m[string(key)] == nil {
+			m[string(key)] = make(map[string]interface{})
+		}
+		m = m[string(key)].(map[string]interface{})
+	}
+
+	// Insert value into the last key.
+	m[string(keys[len(keys)-1])] = value
+}
+
+// Rand returns a random key path that points to a simple value.
+func (db *QuickDB) Rand() [][]byte {
+	db.RLock()
+	defer db.RUnlock()
+	if len(db.m) == 0 {
+		return nil
+	}
+	var keys [][]byte
+	db.rand(db.m, &keys)
+	return keys
+}
+
+func (db *QuickDB) rand(m map[string]interface{}, keys *[][]byte) {
+	i, index := 0, rand.Intn(len(m))
+	for k, v := range m {
+		if i == index {
+			*keys = append(*keys, []byte(k))
+			if v, ok := v.(map[string]interface{}); ok {
+				db.rand(v, keys)
+			}
+			return
+		}
+		i++
+	}
+	panic("quickdb rand: out-of-range")
+}
+
+// Copy copies the entire database.
+func (db *QuickDB) Copy() *QuickDB {
+	db.RLock()
+	defer db.RUnlock()
+	return &QuickDB{m: db.copy(db.m)}
+}
+
+func (db *QuickDB) copy(m map[string]interface{}) map[string]interface{} {
+	clone := make(map[string]interface{}, len(m))
+	for k, v := range m {
+		switch v := v.(type) {
+		case map[string]interface{}:
+			clone[k] = db.copy(v)
+		default:
+			clone[k] = v
+		}
+	}
+	return clone
+}
+
+func randKey() []byte {
+	var min, max = 1, 1024
+	n := rand.Intn(max-min) + min
+	b := make([]byte, n)
+	for i := 0; i < n; i++ {
+		b[i] = byte(rand.Intn(255))
+	}
+	return b
+}
+
+func randKeys() [][]byte {
+	var keys [][]byte
+	var count = rand.Intn(2) + 2
+	for i := 0; i < count; i++ {
+		keys = append(keys, randKey())
+	}
+	return keys
+}
+
+func randValue() []byte {
+	n := rand.Intn(8192)
+	b := make([]byte, n)
+	for i := 0; i < n; i++ {
+		b[i] = byte(rand.Intn(255))
+	}
+	return b
+}
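
testSimulate throttles its goroutines with a buffered channel used as a counting semaphore plus a WaitGroup. A stripped-down sketch of just that concurrency pattern, with the per-transaction work replaced by a no-op:

package main

import (
	"fmt"
	"sync"
)

func main() {
	const total, parallelism = 20, 4

	var wg sync.WaitGroup
	sem := make(chan struct{}, parallelism) // counting semaphore

	for i := 0; i < total; i++ {
		sem <- struct{}{} // blocks once `parallelism` workers are in flight
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot
			_ = n * n                // stand-in for the per-transaction work
		}(i)
	}

	wg.Wait()
	fmt.Println("done")
}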

+ 585 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/tx.go

@@ -0,0 +1,585 @@
+package bolt
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"time"
+	"unsafe"
+)
+
+// txid represents the internal transaction identifier.
+type txid uint64
+
+// Tx represents a read-only or read/write transaction on the database.
+// Read-only transactions can be used for retrieving values for keys and creating cursors.
+// Read/write transactions can create and remove buckets and create and remove keys.
+//
+// IMPORTANT: You must commit or rollback transactions when you are done with
+// them. Pages can not be reclaimed by the writer until no more transactions
+// are using them. A long running read transaction can cause the database to
+// quickly grow.
+type Tx struct {
+	writable       bool
+	managed        bool
+	db             *DB
+	meta           *meta
+	root           Bucket
+	pages          map[pgid]*page
+	stats          TxStats
+	commitHandlers []func()
+}
+
+// init initializes the transaction.
+func (tx *Tx) init(db *DB) {
+	tx.db = db
+	tx.pages = nil
+
+	// Copy the meta page since it can be changed by the writer.
+	tx.meta = &meta{}
+	db.meta().copy(tx.meta)
+
+	// Copy over the root bucket.
+	tx.root = newBucket(tx)
+	tx.root.bucket = &bucket{}
+	*tx.root.bucket = tx.meta.root
+
+	// Increment the transaction id and add a page cache for writable transactions.
+	if tx.writable {
+		tx.pages = make(map[pgid]*page)
+		tx.meta.txid += txid(1)
+	}
+}
+
+// ID returns the transaction id.
+func (tx *Tx) ID() int {
+	return int(tx.meta.txid)
+}
+
+// DB returns a reference to the database that created the transaction.
+func (tx *Tx) DB() *DB {
+	return tx.db
+}
+
+// Size returns current database size in bytes as seen by this transaction.
+func (tx *Tx) Size() int64 {
+	return int64(tx.meta.pgid) * int64(tx.db.pageSize)
+}
+
+// Writable returns whether the transaction can perform write operations.
+func (tx *Tx) Writable() bool {
+	return tx.writable
+}
+
+// Cursor creates a cursor associated with the root bucket.
+// All items in the cursor will return a nil value because all root bucket keys point to buckets.
+// The cursor is only valid as long as the transaction is open.
+// Do not use a cursor after the transaction is closed.
+func (tx *Tx) Cursor() *Cursor {
+	return tx.root.Cursor()
+}
+
+// Stats retrieves a copy of the current transaction statistics.
+func (tx *Tx) Stats() TxStats {
+	return tx.stats
+}
+
+// Bucket retrieves a bucket by name.
+// Returns nil if the bucket does not exist.
+func (tx *Tx) Bucket(name []byte) *Bucket {
+	return tx.root.Bucket(name)
+}
+
+// CreateBucket creates a new bucket.
+// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long.
+func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) {
+	return tx.root.CreateBucket(name)
+}
+
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist.
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
+func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) {
+	return tx.root.CreateBucketIfNotExists(name)
+}
+
+// DeleteBucket deletes a bucket.
+// Returns an error if the bucket cannot be found or if the key represents a non-bucket value.
+func (tx *Tx) DeleteBucket(name []byte) error {
+	return tx.root.DeleteBucket(name)
+}
+
+// ForEach executes a function for each bucket in the root.
+// If the provided function returns an error then the iteration is stopped and
+// the error is returned to the caller.
+func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error {
+	return tx.root.ForEach(func(k, v []byte) error {
+		if err := fn(k, tx.root.Bucket(k)); err != nil {
+			return err
+		}
+		return nil
+	})
+}
+
+// OnCommit adds a handler function to be executed after the transaction successfully commits.
+func (tx *Tx) OnCommit(fn func()) {
+	tx.commitHandlers = append(tx.commitHandlers, fn)
+}
+
+// Commit writes all changes to disk and updates the meta page.
+// Returns an error if a disk write error occurs.
+func (tx *Tx) Commit() error {
+	_assert(!tx.managed, "managed tx commit not allowed")
+	if tx.db == nil {
+		return ErrTxClosed
+	} else if !tx.writable {
+		return ErrTxNotWritable
+	}
+
+	// TODO(benbjohnson): Use vectorized I/O to write out dirty pages.
+
+	// Rebalance nodes which have had deletions.
+	var startTime = time.Now()
+	tx.root.rebalance()
+	if tx.stats.Rebalance > 0 {
+		tx.stats.RebalanceTime += time.Since(startTime)
+	}
+
+	// Spill data onto dirty pages.
+	startTime = time.Now()
+	if err := tx.root.spill(); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.stats.SpillTime += time.Since(startTime)
+
+	// Free the old root bucket.
+	tx.meta.root.root = tx.root.root
+
+	// Free the freelist and allocate new pages for it. This will overestimate
+	// the size of the freelist but not underestimate the size (which would be bad).
+	tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
+	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
+	if err != nil {
+		tx.rollback()
+		return err
+	}
+	if err := tx.db.freelist.write(p); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.meta.freelist = p.id
+
+	// Write dirty pages to disk.
+	startTime = time.Now()
+	if err := tx.write(); err != nil {
+		tx.rollback()
+		return err
+	}
+
+	// If strict mode is enabled then perform a consistency check.
+	// Only the first consistency error is reported in the panic.
+	if tx.db.StrictMode {
+		if err, ok := <-tx.Check(); ok {
+			panic("check fail: " + err.Error())
+		}
+	}
+
+	// Write meta to disk.
+	if err := tx.writeMeta(); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.stats.WriteTime += time.Since(startTime)
+
+	// Finalize the transaction.
+	tx.close()
+
+	// Execute commit handlers now that the locks have been removed.
+	for _, fn := range tx.commitHandlers {
+		fn()
+	}
+
+	return nil
+}
+
+// Rollback closes the transaction and ignores all previous updates.
+func (tx *Tx) Rollback() error {
+	_assert(!tx.managed, "managed tx rollback not allowed")
+	if tx.db == nil {
+		return ErrTxClosed
+	}
+	tx.rollback()
+	return nil
+}
+
+func (tx *Tx) rollback() {
+	if tx.db == nil {
+		return
+	}
+	if tx.writable {
+		tx.db.freelist.rollback(tx.meta.txid)
+		tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist))
+	}
+	tx.close()
+}
+
+func (tx *Tx) close() {
+	if tx.db == nil {
+		return
+	}
+	if tx.writable {
+		// Grab freelist stats.
+		var freelistFreeN = tx.db.freelist.free_count()
+		var freelistPendingN = tx.db.freelist.pending_count()
+		var freelistAlloc = tx.db.freelist.size()
+
+		// Remove writer lock.
+		tx.db.rwlock.Unlock()
+
+		// Merge statistics.
+		tx.db.statlock.Lock()
+		tx.db.stats.FreePageN = freelistFreeN
+		tx.db.stats.PendingPageN = freelistPendingN
+		tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize
+		tx.db.stats.FreelistInuse = freelistAlloc
+		tx.db.stats.TxStats.add(&tx.stats)
+		tx.db.statlock.Unlock()
+	} else {
+		tx.db.removeTx(tx)
+	}
+	tx.db = nil
+}
+
+// Copy writes the entire database to a writer.
+// This function exists for backwards compatibility. Use WriteTo() instead.
+func (tx *Tx) Copy(w io.Writer) error {
+	_, err := tx.WriteTo(w)
+	return err
+}
+
+// WriteTo writes the entire database to a writer.
+// If err == nil then exactly tx.Size() bytes will be written into the writer.
+func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
+	// Attempt to open reader directly.
+	var f *os.File
+	if f, err = os.OpenFile(tx.db.path, os.O_RDONLY|odirect, 0); err != nil {
+		// Fallback to a regular open if that doesn't work.
+		if f, err = os.OpenFile(tx.db.path, os.O_RDONLY, 0); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the meta pages.
+	tx.db.metalock.Lock()
+	n, err = io.CopyN(w, f, int64(tx.db.pageSize*2))
+	tx.db.metalock.Unlock()
+	if err != nil {
+		_ = f.Close()
+		return n, fmt.Errorf("meta copy: %s", err)
+	}
+
+	// Copy data pages.
+	wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2))
+	n += wn
+	if err != nil {
+		_ = f.Close()
+		return n, err
+	}
+
+	return n, f.Close()
+}
+
+// CopyFile copies the entire database to file at the given path.
+// A reader transaction is maintained during the copy so it is safe to continue
+// using the database while a copy is in progress.
+func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
+	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
+	if err != nil {
+		return err
+	}
+
+	err = tx.Copy(f)
+	if err != nil {
+		_ = f.Close()
+		return err
+	}
+	return f.Close()
+}
+
+// Check performs several consistency checks on the database for this transaction.
+// An error is returned if any inconsistency is found.
+//
+// It can be safely run concurrently on a writable transaction. However, this
+// incurs a high cost for large databases and databases with a lot of subbuckets
+// because of caching. This overhead can be removed if running on a read-only
+// transaction; however, it is not safe to execute other writer transactions at
+// the same time.
+func (tx *Tx) Check() <-chan error {
+	ch := make(chan error)
+	go tx.check(ch)
+	return ch
+}
+
+func (tx *Tx) check(ch chan error) {
+	// Check if any pages are double freed.
+	freed := make(map[pgid]bool)
+	for _, id := range tx.db.freelist.all() {
+		if freed[id] {
+			ch <- fmt.Errorf("page %d: already freed", id)
+		}
+		freed[id] = true
+	}
+
+	// Track every reachable page.
+	reachable := make(map[pgid]*page)
+	reachable[0] = tx.page(0) // meta0
+	reachable[1] = tx.page(1) // meta1
+	for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
+		reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
+	}
+
+	// Recursively check buckets.
+	tx.checkBucket(&tx.root, reachable, freed, ch)
+
+	// Ensure all pages below high water mark are either reachable or freed.
+	for i := pgid(0); i < tx.meta.pgid; i++ {
+		_, isReachable := reachable[i]
+		if !isReachable && !freed[i] {
+			ch <- fmt.Errorf("page %d: unreachable unfreed", int(i))
+		}
+	}
+
+	// Close the channel to signal completion.
+	close(ch)
+}
+
+func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) {
+	// Ignore inline buckets.
+	if b.root == 0 {
+		return
+	}
+
+	// Check every page used by this bucket.
+	b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
+		if p.id > tx.meta.pgid {
+			ch <- fmt.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid))
+		}
+
+		// Ensure each page is only referenced once.
+		for i := pgid(0); i <= pgid(p.overflow); i++ {
+			var id = p.id + i
+			if _, ok := reachable[id]; ok {
+				ch <- fmt.Errorf("page %d: multiple references", int(id))
+			}
+			reachable[id] = p
+		}
+
+		// We should only encounter un-freed leaf and branch pages.
+		if freed[p.id] {
+			ch <- fmt.Errorf("page %d: reachable freed", int(p.id))
+		} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 {
+			ch <- fmt.Errorf("page %d: invalid type: %s", int(p.id), p.typ())
+		}
+	})
+
+	// Check each bucket within this bucket.
+	_ = b.ForEach(func(k, v []byte) error {
+		if child := b.Bucket(k); child != nil {
+			tx.checkBucket(child, reachable, freed, ch)
+		}
+		return nil
+	})
+}
+
+// allocate returns a contiguous block of memory starting at a given page.
+func (tx *Tx) allocate(count int) (*page, error) {
+	p, err := tx.db.allocate(count)
+	if err != nil {
+		return nil, err
+	}
+
+	// Save to our page cache.
+	tx.pages[p.id] = p
+
+	// Update statistics.
+	tx.stats.PageCount++
+	tx.stats.PageAlloc += count * tx.db.pageSize
+
+	return p, nil
+}
+
+// write writes any dirty pages to disk.
+func (tx *Tx) write() error {
+	// Sort pages by id.
+	pages := make(pages, 0, len(tx.pages))
+	for _, p := range tx.pages {
+		pages = append(pages, p)
+	}
+	sort.Sort(pages)
+
+	// Write pages to disk in order.
+	for _, p := range pages {
+		size := (int(p.overflow) + 1) * tx.db.pageSize
+		buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:size]
+		offset := int64(p.id) * int64(tx.db.pageSize)
+		if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
+			return err
+		}
+
+		// Update statistics.
+		tx.stats.Write++
+	}
+	if !tx.db.NoSync || IgnoreNoSync {
+		if err := fdatasync(tx.db); err != nil {
+			return err
+		}
+	}
+
+	// Clear out page cache.
+	tx.pages = make(map[pgid]*page)
+
+	return nil
+}
+
+// writeMeta writes the meta to the disk.
+func (tx *Tx) writeMeta() error {
+	// Create a temporary buffer for the meta page.
+	buf := make([]byte, tx.db.pageSize)
+	p := tx.db.pageInBuffer(buf, 0)
+	tx.meta.write(p)
+
+	// Write the meta page to file.
+	if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
+		return err
+	}
+	if !tx.db.NoSync || IgnoreNoSync {
+		if err := fdatasync(tx.db); err != nil {
+			return err
+		}
+	}
+
+	// Update statistics.
+	tx.stats.Write++
+
+	return nil
+}
+
+// page returns a reference to the page with a given id.
+// If the page has been written to then a temporary buffered page is returned.
+func (tx *Tx) page(id pgid) *page {
+	// Check the dirty pages first.
+	if tx.pages != nil {
+		if p, ok := tx.pages[id]; ok {
+			return p
+		}
+	}
+
+	// Otherwise return directly from the mmap.
+	return tx.db.page(id)
+}
+
+// forEachPage iterates over every page within a given page and executes a function.
+func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) {
+	p := tx.page(pgid)
+
+	// Execute function.
+	fn(p, depth)
+
+	// Recursively loop over children.
+	if (p.flags & branchPageFlag) != 0 {
+		for i := 0; i < int(p.count); i++ {
+			elem := p.branchPageElement(uint16(i))
+			tx.forEachPage(elem.pgid, depth+1, fn)
+		}
+	}
+}
+
+// Page returns page information for a given page number.
+// This is only safe for concurrent use when used by a writable transaction.
+func (tx *Tx) Page(id int) (*PageInfo, error) {
+	if tx.db == nil {
+		return nil, ErrTxClosed
+	} else if pgid(id) >= tx.meta.pgid {
+		return nil, nil
+	}
+
+	// Build the page info.
+	p := tx.db.page(pgid(id))
+	info := &PageInfo{
+		ID:            id,
+		Count:         int(p.count),
+		OverflowCount: int(p.overflow),
+	}
+
+	// Determine the type (or if it's free).
+	if tx.db.freelist.freed(pgid(id)) {
+		info.Type = "free"
+	} else {
+		info.Type = p.typ()
+	}
+
+	return info, nil
+}
+
+// TxStats represents statistics about the actions performed by the transaction.
+type TxStats struct {
+	// Page statistics.
+	PageCount int // number of page allocations
+	PageAlloc int // total bytes allocated
+
+	// Cursor statistics.
+	CursorCount int // number of cursors created
+
+	// Node statistics
+	NodeCount int // number of node allocations
+	NodeDeref int // number of node dereferences
+
+	// Rebalance statistics.
+	Rebalance     int           // number of node rebalances
+	RebalanceTime time.Duration // total time spent rebalancing
+
+	// Split/Spill statistics.
+	Split     int           // number of nodes split
+	Spill     int           // number of nodes spilled
+	SpillTime time.Duration // total time spent spilling
+
+	// Write statistics.
+	Write     int           // number of writes performed
+	WriteTime time.Duration // total time spent writing to disk
+}
+
+func (s *TxStats) add(other *TxStats) {
+	s.PageCount += other.PageCount
+	s.PageAlloc += other.PageAlloc
+	s.CursorCount += other.CursorCount
+	s.NodeCount += other.NodeCount
+	s.NodeDeref += other.NodeDeref
+	s.Rebalance += other.Rebalance
+	s.RebalanceTime += other.RebalanceTime
+	s.Split += other.Split
+	s.Spill += other.Spill
+	s.SpillTime += other.SpillTime
+	s.Write += other.Write
+	s.WriteTime += other.WriteTime
+}
+
+// Sub calculates and returns the difference between two sets of transaction stats.
+// This is useful when obtaining stats at two different points in time and
+// you need the performance counters that occurred within that time span.
+func (s *TxStats) Sub(other *TxStats) TxStats {
+	var diff TxStats
+	diff.PageCount = s.PageCount - other.PageCount
+	diff.PageAlloc = s.PageAlloc - other.PageAlloc
+	diff.CursorCount = s.CursorCount - other.CursorCount
+	diff.NodeCount = s.NodeCount - other.NodeCount
+	diff.NodeDeref = s.NodeDeref - other.NodeDeref
+	diff.Rebalance = s.Rebalance - other.Rebalance
+	diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime
+	diff.Split = s.Split - other.Split
+	diff.Spill = s.Spill - other.Spill
+	diff.SpillTime = s.SpillTime - other.SpillTime
+	diff.Write = s.Write - other.Write
+	diff.WriteTime = s.WriteTime - other.WriteTime
+	return diff
+}
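
Copy, WriteTo and CopyFile above produce a consistent snapshot because they run against an open transaction. A rough usage sketch for a hot backup; the file paths are illustrative, and Open, View and Close come from db.go, which is part of this change but not shown in this hunk:

package main

import (
	"log"

	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
)

func main() {
	db, err := bolt.Open("my.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// CopyFile runs inside a read transaction, so readers and writers can
	// keep working while the snapshot is written.
	if err := db.View(func(tx *bolt.Tx) error {
		return tx.CopyFile("my.db.backup", 0600)
	}); err != nil {
		log.Fatal(err)
	}
}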

+ 424 - 0
Godeps/_workspace/src/github.com/boltdb/bolt/tx_test.go

@@ -0,0 +1,424 @@
+package bolt_test
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+// Ensure that committing a closed transaction returns an error.
+func TestTx_Commit_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.CreateBucket([]byte("foo"))
+	ok(t, tx.Commit())
+	equals(t, tx.Commit(), bolt.ErrTxClosed)
+}
+
+// Ensure that rolling back a closed transaction returns an error.
+func TestTx_Rollback_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	ok(t, tx.Rollback())
+	equals(t, tx.Rollback(), bolt.ErrTxClosed)
+}
+
+// Ensure that committing a read-only transaction returns an error.
+func TestTx_Commit_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(false)
+	equals(t, tx.Commit(), bolt.ErrTxNotWritable)
+}
+
+// Ensure that a transaction can retrieve a cursor on the root bucket.
+func TestTx_Cursor(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.CreateBucket([]byte("woojits"))
+		c := tx.Cursor()
+
+		k, v := c.First()
+		equals(t, "widgets", string(k))
+		assert(t, v == nil, "")
+
+		k, v = c.Next()
+		equals(t, "woojits", string(k))
+		assert(t, v == nil, "")
+
+		k, v = c.Next()
+		assert(t, k == nil, "")
+		assert(t, v == nil, "")
+
+		return nil
+	})
+}
+
+// Ensure that creating a bucket with a read-only transaction returns an error.
+func TestTx_CreateBucket_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.View(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("foo"))
+		assert(t, b == nil, "")
+		equals(t, bolt.ErrTxNotWritable, err)
+		return nil
+	})
+}
+
+// Ensure that creating a bucket on a closed transaction returns an error.
+func TestTx_CreateBucket_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.Commit()
+	b, err := tx.CreateBucket([]byte("foo"))
+	assert(t, b == nil, "")
+	equals(t, bolt.ErrTxClosed, err)
+}
+
+// Ensure that a Tx can retrieve a bucket.
+func TestTx_Bucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		b := tx.Bucket([]byte("widgets"))
+		assert(t, b != nil, "")
+		return nil
+	})
+}
+
+// Ensure that a Tx retrieving a non-existent key returns nil.
+func TestTx_Get_Missing(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		value := tx.Bucket([]byte("widgets")).Get([]byte("no_such_key"))
+		assert(t, value == nil, "")
+		return nil
+	})
+}
+
+// Ensure that a bucket can be created and retrieved.
+func TestTx_CreateBucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	// Create a bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		assert(t, b != nil, "")
+		ok(t, err)
+		return nil
+	})
+
+	// Read the bucket through a separate transaction.
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		assert(t, b != nil, "")
+		return nil
+	})
+}
+
+// Ensure that a bucket can be created if it doesn't already exist.
+func TestTx_CreateBucketIfNotExists(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
+		assert(t, b != nil, "")
+		ok(t, err)
+
+		b, err = tx.CreateBucketIfNotExists([]byte("widgets"))
+		assert(t, b != nil, "")
+		ok(t, err)
+
+		b, err = tx.CreateBucketIfNotExists([]byte{})
+		assert(t, b == nil, "")
+		equals(t, bolt.ErrBucketNameRequired, err)
+
+		b, err = tx.CreateBucketIfNotExists(nil)
+		assert(t, b == nil, "")
+		equals(t, bolt.ErrBucketNameRequired, err)
+		return nil
+	})
+
+	// Read the bucket through a separate transaction.
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		assert(t, b != nil, "")
+		return nil
+	})
+}
+
+// Ensure that a bucket cannot be created twice.
+func TestTx_CreateBucket_Exists(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	// Create a bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		assert(t, b != nil, "")
+		ok(t, err)
+		return nil
+	})
+
+	// Create the same bucket again.
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		assert(t, b == nil, "")
+		equals(t, bolt.ErrBucketExists, err)
+		return nil
+	})
+}
+
+// Ensure that a bucket is created with a non-blank name.
+func TestTx_CreateBucket_NameRequired(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket(nil)
+		assert(t, b == nil, "")
+		equals(t, bolt.ErrBucketNameRequired, err)
+		return nil
+	})
+}
+
+// Ensure that a bucket can be deleted.
+func TestTx_DeleteBucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	// Create a bucket and add a value.
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		return nil
+	})
+
+	// Delete the bucket and make sure we can't get the value.
+	db.Update(func(tx *bolt.Tx) error {
+		ok(t, tx.DeleteBucket([]byte("widgets")))
+		assert(t, tx.Bucket([]byte("widgets")) == nil, "")
+		return nil
+	})
+
+	db.Update(func(tx *bolt.Tx) error {
+		// Create the bucket again and make sure there's not a phantom value.
+		b, err := tx.CreateBucket([]byte("widgets"))
+		assert(t, b != nil, "")
+		ok(t, err)
+		assert(t, tx.Bucket([]byte("widgets")).Get([]byte("foo")) == nil, "")
+		return nil
+	})
+}
+
+// Ensure that deleting a bucket on a closed transaction returns an error.
+func TestTx_DeleteBucket_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.Commit()
+	equals(t, tx.DeleteBucket([]byte("foo")), bolt.ErrTxClosed)
+}
+
+// Ensure that deleting a bucket with a read-only transaction returns an error.
+func TestTx_DeleteBucket_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.View(func(tx *bolt.Tx) error {
+		equals(t, tx.DeleteBucket([]byte("foo")), bolt.ErrTxNotWritable)
+		return nil
+	})
+}
+
+// Ensure that nothing happens when deleting a bucket that doesn't exist.
+func TestTx_DeleteBucket_NotFound(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		equals(t, bolt.ErrBucketNotFound, tx.DeleteBucket([]byte("widgets")))
+		return nil
+	})
+}
+
+// Ensure that Tx commit handlers are called after a transaction successfully commits.
+func TestTx_OnCommit(t *testing.T) {
+	var x int
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.OnCommit(func() { x += 1 })
+		tx.OnCommit(func() { x += 2 })
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+	equals(t, 3, x)
+}
+
+// Ensure that Tx commit handlers are NOT called after a transaction rolls back.
+func TestTx_OnCommit_Rollback(t *testing.T) {
+	var x int
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.OnCommit(func() { x += 1 })
+		tx.OnCommit(func() { x += 2 })
+		tx.CreateBucket([]byte("widgets"))
+		return errors.New("rollback this commit")
+	})
+	equals(t, 0, x)
+}
+
+// Ensure that the database can be copied to a file path.
+func TestTx_CopyFile(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	var dest = tempfile()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("bat"))
+		return nil
+	})
+
+	ok(t, db.View(func(tx *bolt.Tx) error { return tx.CopyFile(dest, 0600) }))
+
+	db2, err := bolt.Open(dest, 0600, nil)
+	ok(t, err)
+	defer db2.Close()
+
+	db2.View(func(tx *bolt.Tx) error {
+		equals(t, []byte("bar"), tx.Bucket([]byte("widgets")).Get([]byte("foo")))
+		equals(t, []byte("bat"), tx.Bucket([]byte("widgets")).Get([]byte("baz")))
+		return nil
+	})
+}
+
+type failWriterError struct{}
+
+func (failWriterError) Error() string {
+	return "error injected for tests"
+}
+
+type failWriter struct {
+	// fail after this many bytes
+	After int
+}
+
+func (f *failWriter) Write(p []byte) (n int, err error) {
+	n = len(p)
+	if n > f.After {
+		n = f.After
+		err = failWriterError{}
+	}
+	f.After -= n
+	return n, err
+}
+
+// Ensure that Copy handles write errors right.
+func TestTx_CopyFile_Error_Meta(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("bat"))
+		return nil
+	})
+
+	err := db.View(func(tx *bolt.Tx) error { return tx.Copy(&failWriter{}) })
+	equals(t, err.Error(), "meta copy: error injected for tests")
+}
+
+// Ensure that Copy handles write errors right.
+func TestTx_CopyFile_Error_Normal(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("bat"))
+		return nil
+	})
+
+	err := db.View(func(tx *bolt.Tx) error { return tx.Copy(&failWriter{3 * db.Info().PageSize}) })
+	equals(t, err.Error(), "error injected for tests")
+}
+
+func ExampleTx_Rollback() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Create a bucket.
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+
+	// Set a value for a key.
+	db.Update(func(tx *bolt.Tx) error {
+		return tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+	})
+
+	// Update the key but rollback the transaction so it never saves.
+	tx, _ := db.Begin(true)
+	b := tx.Bucket([]byte("widgets"))
+	b.Put([]byte("foo"), []byte("baz"))
+	tx.Rollback()
+
+	// Ensure that our original value is still set.
+	db.View(func(tx *bolt.Tx) error {
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		fmt.Printf("The value for 'foo' is still: %s\n", value)
+		return nil
+	})
+
+	// Output:
+	// The value for 'foo' is still: bar
+}
+
+func ExampleTx_CopyFile() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Create a bucket and a key.
+	db.Update(func(tx *bolt.Tx) error {
+		tx.CreateBucket([]byte("widgets"))
+		tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
+		return nil
+	})
+
+	// Copy the database to another file.
+	toFile := tempfile()
+	db.View(func(tx *bolt.Tx) error { return tx.CopyFile(toFile, 0666) })
+	defer os.Remove(toFile)
+
+	// Open the cloned database.
+	db2, _ := bolt.Open(toFile, 0666, nil)
+	defer db2.Close()
+
+	// Ensure that the key exists in the copy.
+	db2.View(func(tx *bolt.Tx) error {
+		value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+		fmt.Printf("The value for 'foo' in the clone is: %s\n", value)
+		return nil
+	})
+
+	// Output:
+	// The value for 'foo' in the clone is: bar
+}

+ 1 - 0
Godeps/_workspace/src/github.com/google/btree/.travis.yml

@@ -0,0 +1 @@
+language: go

+ 202 - 0
Godeps/_workspace/src/github.com/google/btree/LICENSE

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 12 - 0
Godeps/_workspace/src/github.com/google/btree/README.md

@@ -0,0 +1,12 @@
+# BTree implementation for Go
+
+![Travis CI Build Status](https://api.travis-ci.org/google/btree.svg?branch=master)
+
+This package provides an in-memory B-Tree implementation for Go, useful as
+an ordered, mutable data structure.
+
+The API is based off of the wonderful
+http://godoc.org/github.com/petar/GoLLRB/llrb, and is meant to allow btree to
+act as a drop-in replacement for gollrb trees.
+
+See http://godoc.org/github.com/google/btree for documentation.

+ 571 - 0
Godeps/_workspace/src/github.com/google/btree/btree.go

@@ -0,0 +1,571 @@
+// Copyright 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package btree implements in-memory B-Trees of arbitrary degree.
+//
+// btree implements an in-memory B-Tree for use as an ordered data structure.
+// It is not meant for persistent storage solutions.
+//
+// It has a flatter structure than an equivalent red-black or other binary tree,
+// which in some cases yields better memory usage and/or performance.
+// See some discussion on the matter here:
+//   http://google-opensource.blogspot.com/2013/01/c-containers-that-save-memory-and-time.html
+// Note, though, that this project is in no way related to the C++ B-Tree
+// implementation written about there.
+//
+// Within this tree, each node contains a slice of items and a (possibly nil)
+// slice of children.  For basic numeric values or raw structs, this can cause
+// efficiency differences when compared to equivalent C++ template code that
+// stores values in arrays within the node:
+//   * Due to the overhead of storing values as interfaces (each
+//     value needs to be stored as the value itself, then 2 words for the
+//     interface pointing to that value and its type), resulting in higher
+//     memory use.
+//   * Since interfaces can point to values anywhere in memory, values are
+//     most likely not stored in contiguous blocks, resulting in a higher
+//     number of cache misses.
+// These issues don't tend to matter, though, when working with strings or other
+// heap-allocated structures, since C++-equivalent structures also must store
+// pointers and also distribute their values across the heap.
+//
+// This implementation is designed to be a drop-in replacement for gollrb.LLRB
+// trees (http://github.com/petar/gollrb), an excellent and probably the most
+// widely used ordered tree implementation in the Go ecosystem currently.
+// Its functions, therefore, exactly mirror those of
+// llrb.LLRB where possible.  Unlike gollrb, though, we currently don't
+// support storing multiple equivalent values or backwards iteration.
+package btree
+
+import (
+	"fmt"
+	"io"
+	"sort"
+	"strings"
+)
+
+// Item represents a single object in the tree.
+type Item interface {
+	// Less tests whether the current item is less than the given argument.
+	//
+	// This must provide a strict weak ordering.
+	// If !a.Less(b) && !b.Less(a), we treat this to mean a == b (i.e. we can only
+	// hold one of either a or b in the tree).
+	Less(than Item) bool
+}
+
+// ItemIterator allows callers of Ascend* to iterate in-order over portions of
+// the tree.  When this function returns false, iteration will stop and the
+// associated Ascend* function will immediately return.
+type ItemIterator func(i Item) bool
+
+// New creates a new B-Tree with the given degree.
+//
+// New(2), for example, will create a 2-3-4 tree (each node contains 1-3 items
+// and 2-4 children).
+func New(degree int) *BTree {
+	if degree <= 1 {
+		panic("bad degree")
+	}
+	return &BTree{
+		degree:   degree,
+		freelist: make([]*node, 0, 32),
+	}
+}
+
+// items stores items in a node.
+type items []Item
+
+// insertAt inserts a value into the given index, pushing all subsequent values
+// forward.
+func (s *items) insertAt(index int, item Item) {
+	*s = append(*s, nil)
+	if index < len(*s) {
+		copy((*s)[index+1:], (*s)[index:])
+	}
+	(*s)[index] = item
+}
+
+// removeAt removes a value at a given index, pulling all subsequent values
+// back.
+func (s *items) removeAt(index int) Item {
+	item := (*s)[index]
+	copy((*s)[index:], (*s)[index+1:])
+	*s = (*s)[:len(*s)-1]
+	return item
+}
+
+// pop removes and returns the last element in the list.
+func (s *items) pop() (out Item) {
+	index := len(*s) - 1
+	out, *s = (*s)[index], (*s)[:index]
+	return
+}
+
+// find returns the index where the given item should be inserted into this
+// list.  'found' is true if the item already exists in the list at the given
+// index.
+func (s items) find(item Item) (index int, found bool) {
+	i := sort.Search(len(s), func(i int) bool {
+		return item.Less(s[i])
+	})
+	if i > 0 && !s[i-1].Less(item) {
+		return i - 1, true
+	}
+	return i, false
+}
+
+// children stores child nodes in a node.
+type children []*node
+
+// insertAt inserts a value into the given index, pushing all subsequent values
+// forward.
+func (s *children) insertAt(index int, n *node) {
+	*s = append(*s, nil)
+	if index < len(*s) {
+		copy((*s)[index+1:], (*s)[index:])
+	}
+	(*s)[index] = n
+}
+
+// removeAt removes a value at a given index, pulling all subsequent values
+// back.
+func (s *children) removeAt(index int) *node {
+	n := (*s)[index]
+	copy((*s)[index:], (*s)[index+1:])
+	*s = (*s)[:len(*s)-1]
+	return n
+}
+
+// pop removes and returns the last element in the list.
+func (s *children) pop() (out *node) {
+	index := len(*s) - 1
+	out, *s = (*s)[index], (*s)[:index]
+	return
+}
+
+// node is an internal node in a tree.
+//
+// It must at all times maintain the invariant that either
+//   * len(children) == 0, len(items) unconstrained
+//   * len(children) == len(items) + 1
+type node struct {
+	items    items
+	children children
+	t        *BTree
+}
+
+// split splits the given node at the given index.  The current node shrinks,
+// and this function returns the item that existed at that index and a new node
+// containing all items/children after it.
+func (n *node) split(i int) (Item, *node) {
+	item := n.items[i]
+	next := n.t.newNode()
+	next.items = append(next.items, n.items[i+1:]...)
+	n.items = n.items[:i]
+	if len(n.children) > 0 {
+		next.children = append(next.children, n.children[i+1:]...)
+		n.children = n.children[:i+1]
+	}
+	return item, next
+}
+
+// maybeSplitChild checks if a child should be split, and if so splits it.
+// Returns whether or not a split occurred.
+func (n *node) maybeSplitChild(i, maxItems int) bool {
+	if len(n.children[i].items) < maxItems {
+		return false
+	}
+	first := n.children[i]
+	item, second := first.split(maxItems / 2)
+	n.items.insertAt(i, item)
+	n.children.insertAt(i+1, second)
+	return true
+}
+
+// insert inserts an item into the subtree rooted at this node, making sure
+// no nodes in the subtree exceed maxItems items.  Should an equivalent item
+// be found/replaced by insert, it will be returned.
+func (n *node) insert(item Item, maxItems int) Item {
+	i, found := n.items.find(item)
+	if found {
+		out := n.items[i]
+		n.items[i] = item
+		return out
+	}
+	if len(n.children) == 0 {
+		n.items.insertAt(i, item)
+		return nil
+	}
+	if n.maybeSplitChild(i, maxItems) {
+		inTree := n.items[i]
+		switch {
+		case item.Less(inTree):
+			// no change, we want first split node
+		case inTree.Less(item):
+			i++ // we want second split node
+		default:
+			out := n.items[i]
+			n.items[i] = item
+			return out
+		}
+	}
+	return n.children[i].insert(item, maxItems)
+}
+
+// get finds the given key in the subtree and returns it.
+func (n *node) get(key Item) Item {
+	i, found := n.items.find(key)
+	if found {
+		return n.items[i]
+	} else if len(n.children) > 0 {
+		return n.children[i].get(key)
+	}
+	return nil
+}
+
+// toRemove details what item to remove in a node.remove call.
+type toRemove int
+
+const (
+	removeItem toRemove = iota // removes the given item
+	removeMin                  // removes smallest item in the subtree
+	removeMax                  // removes largest item in the subtree
+)
+
+// remove removes an item from the subtree rooted at this node.
+func (n *node) remove(item Item, minItems int, typ toRemove) Item {
+	var i int
+	var found bool
+	switch typ {
+	case removeMax:
+		if len(n.children) == 0 {
+			return n.items.pop()
+		}
+		i = len(n.items)
+	case removeMin:
+		if len(n.children) == 0 {
+			return n.items.removeAt(0)
+		}
+		i = 0
+	case removeItem:
+		i, found = n.items.find(item)
+		if len(n.children) == 0 {
+			if found {
+				return n.items.removeAt(i)
+			}
+			return nil
+		}
+	default:
+		panic("invalid type")
+	}
+	// If we get to here, we have children.
+	child := n.children[i]
+	if len(child.items) <= minItems {
+		return n.growChildAndRemove(i, item, minItems, typ)
+	}
+	// Either we had enough items to begin with, or we've done some
+	// merging/stealing, because we've got enough now and we're ready to return
+	// stuff.
+	if found {
+		// The item exists at index 'i', and the child we've selected can give us a
+		// predecessor, since if we've gotten here it's got > minItems items in it.
+		out := n.items[i]
+		// We use our special-case 'remove' call with typ=maxItem to pull the
+		// predecessor of item i (the rightmost leaf of our immediate left child)
+		// and set it into where we pulled the item from.
+		n.items[i] = child.remove(nil, minItems, removeMax)
+		return out
+	}
+	// Final recursive call.  Once we're here, we know that the item isn't in this
+	// node and that the child is big enough to remove from.
+	return child.remove(item, minItems, typ)
+}
+
+// growChildAndRemove grows child 'i' to make sure it's possible to remove an
+// item from it while keeping it at minItems, then calls remove to actually
+// remove it.
+//
+// Most documentation says we have to do two sets of special casing:
+//   1) item is in this node
+//   2) item is in child
+// In both cases, we need to handle the two subcases:
+//   A) node has enough values that it can spare one
+//   B) node doesn't have enough values
+// For the latter, we have to check:
+//   a) left sibling has node to spare
+//   b) right sibling has node to spare
+//   c) we must merge
+// To simplify our code here, we handle cases #1 and #2 the same:
+// If a node doesn't have enough items, we make sure it does (using a,b,c).
+// We then simply redo our remove call, and the second time (regardless of
+// whether we're in case 1 or 2), we'll have enough items and can guarantee
+// that we hit case A.
+func (n *node) growChildAndRemove(i int, item Item, minItems int, typ toRemove) Item {
+	child := n.children[i]
+	if i > 0 && len(n.children[i-1].items) > minItems {
+		// Steal from left child
+		stealFrom := n.children[i-1]
+		stolenItem := stealFrom.items.pop()
+		child.items.insertAt(0, n.items[i-1])
+		n.items[i-1] = stolenItem
+		if len(stealFrom.children) > 0 {
+			child.children.insertAt(0, stealFrom.children.pop())
+		}
+	} else if i < len(n.items) && len(n.children[i+1].items) > minItems {
+		// steal from right child
+		stealFrom := n.children[i+1]
+		stolenItem := stealFrom.items.removeAt(0)
+		child.items = append(child.items, n.items[i])
+		n.items[i] = stolenItem
+		if len(stealFrom.children) > 0 {
+			child.children = append(child.children, stealFrom.children.removeAt(0))
+		}
+	} else {
+		if i >= len(n.items) {
+			i--
+			child = n.children[i]
+		}
+		// merge with right child
+		mergeItem := n.items.removeAt(i)
+		mergeChild := n.children.removeAt(i + 1)
+		child.items = append(child.items, mergeItem)
+		child.items = append(child.items, mergeChild.items...)
+		child.children = append(child.children, mergeChild.children...)
+		n.t.freeNode(mergeChild)
+	}
+	return n.remove(item, minItems, typ)
+}
+
+// iterate provides a simple method for iterating over elements in the tree.
+// It could probably use some work to be extra-efficient (it calls from() a
+// little more than it should), but it works pretty well for now.
+//
+// It requires that 'from' and 'to' both return true for values we should hit
+// with the iterator.  It should also be the case that 'from' returns true for
+// values less than or equal to values 'to' returns true for, and 'to'
+// returns true for values greater than or equal to those that 'from'
+// does.
+func (n *node) iterate(from, to func(Item) bool, iter ItemIterator) bool {
+	for i, item := range n.items {
+		if !from(item) {
+			continue
+		}
+		if len(n.children) > 0 && !n.children[i].iterate(from, to, iter) {
+			return false
+		}
+		if !to(item) {
+			return false
+		}
+		if !iter(item) {
+			return false
+		}
+	}
+	if len(n.children) > 0 {
+		return n.children[len(n.children)-1].iterate(from, to, iter)
+	}
+	return true
+}
+
+// Used for testing/debugging purposes.
+func (n *node) print(w io.Writer, level int) {
+	fmt.Fprintf(w, "%sNODE:%v\n", strings.Repeat("  ", level), n.items)
+	for _, c := range n.children {
+		c.print(w, level+1)
+	}
+}
+
+// BTree is an implementation of a B-Tree.
+//
+// BTree stores Item instances in an ordered structure, allowing easy insertion,
+// removal, and iteration.
+//
+// Write operations are not safe for concurrent mutation by multiple
+// goroutines, but Read operations are.
+type BTree struct {
+	degree   int
+	length   int
+	root     *node
+	freelist []*node
+}
+
+// maxItems returns the max number of items to allow per node.
+func (t *BTree) maxItems() int {
+	return t.degree*2 - 1
+}
+
+// minItems returns the min number of items to allow per node (ignored for the
+// root node).
+func (t *BTree) minItems() int {
+	return t.degree - 1
+}
+
+func (t *BTree) newNode() (n *node) {
+	index := len(t.freelist) - 1
+	if index < 0 {
+		return &node{t: t}
+	}
+	t.freelist, n = t.freelist[:index], t.freelist[index]
+	return
+}
+
+func (t *BTree) freeNode(n *node) {
+	if len(t.freelist) < cap(t.freelist) {
+		for i := range n.items {
+			n.items[i] = nil // clear to allow GC
+		}
+		n.items = n.items[:0]
+		for i := range n.children {
+			n.children[i] = nil // clear to allow GC
+		}
+		n.children = n.children[:0]
+		t.freelist = append(t.freelist, n)
+	}
+}
+
+// ReplaceOrInsert adds the given item to the tree.  If an item in the tree
+// already equals the given one, it is removed from the tree and returned.
+// Otherwise, nil is returned.
+//
+// nil cannot be added to the tree (will panic).
+func (t *BTree) ReplaceOrInsert(item Item) Item {
+	if item == nil {
+		panic("nil item being added to BTree")
+	}
+	if t.root == nil {
+		t.root = t.newNode()
+		t.root.items = append(t.root.items, item)
+		t.length++
+		return nil
+	} else if len(t.root.items) >= t.maxItems() {
+		item2, second := t.root.split(t.maxItems() / 2)
+		oldroot := t.root
+		t.root = t.newNode()
+		t.root.items = append(t.root.items, item2)
+		t.root.children = append(t.root.children, oldroot, second)
+	}
+	out := t.root.insert(item, t.maxItems())
+	if out == nil {
+		t.length++
+	}
+	return out
+}
+
+// Delete removes an item equal to the passed in item from the tree, returning
+// it.  If no such item exists, returns nil.
+func (t *BTree) Delete(item Item) Item {
+	return t.deleteItem(item, removeItem)
+}
+
+// DeleteMin removes the smallest item in the tree and returns it.
+// If no such item exists, returns nil.
+func (t *BTree) DeleteMin() Item {
+	return t.deleteItem(nil, removeMin)
+}
+
+// DeleteMax removes the largest item in the tree and returns it.
+// If no such item exists, returns nil.
+func (t *BTree) DeleteMax() Item {
+	return t.deleteItem(nil, removeMax)
+}
+
+func (t *BTree) deleteItem(item Item, typ toRemove) Item {
+	if t.root == nil || len(t.root.items) == 0 {
+		return nil
+	}
+	out := t.root.remove(item, t.minItems(), typ)
+	if len(t.root.items) == 0 && len(t.root.children) > 0 {
+		oldroot := t.root
+		t.root = t.root.children[0]
+		t.freeNode(oldroot)
+	}
+	if out != nil {
+		t.length--
+	}
+	return out
+}
+
+// AscendRange calls the iterator for every value in the tree within the range
+// [greaterOrEqual, lessThan), until iterator returns false.
+func (t *BTree) AscendRange(greaterOrEqual, lessThan Item, iterator ItemIterator) {
+	if t.root == nil {
+		return
+	}
+	t.root.iterate(
+		func(a Item) bool { return !a.Less(greaterOrEqual) },
+		func(a Item) bool { return a.Less(lessThan) },
+		iterator)
+}
+
+// AscendLessThan calls the iterator for every value in the tree within the range
+// [first, pivot), until iterator returns false.
+func (t *BTree) AscendLessThan(pivot Item, iterator ItemIterator) {
+	if t.root == nil {
+		return
+	}
+	t.root.iterate(
+		func(a Item) bool { return true },
+		func(a Item) bool { return a.Less(pivot) },
+		iterator)
+}
+
+// AscendGreaterOrEqual calls the iterator for every value in the tree within
+// the range [pivot, last], until iterator returns false.
+func (t *BTree) AscendGreaterOrEqual(pivot Item, iterator ItemIterator) {
+	if t.root == nil {
+		return
+	}
+	t.root.iterate(
+		func(a Item) bool { return !a.Less(pivot) },
+		func(a Item) bool { return true },
+		iterator)
+}
+
+// Ascend calls the iterator for every value in the tree within the range
+// [first, last], until iterator returns false.
+func (t *BTree) Ascend(iterator ItemIterator) {
+	if t.root == nil {
+		return
+	}
+	t.root.iterate(
+		func(a Item) bool { return true },
+		func(a Item) bool { return true },
+		iterator)
+}
+
+// Get looks for the key item in the tree, returning it.  It returns nil if
+// unable to find that item.
+func (t *BTree) Get(key Item) Item {
+	if t.root == nil {
+		return nil
+	}
+	return t.root.get(key)
+}
+
+// Has returns true if the given key is in the tree.
+func (t *BTree) Has(key Item) bool {
+	return t.Get(key) != nil
+}
+
+// Len returns the number of items currently in the tree.
+func (t *BTree) Len() int {
+	return t.length
+}
+
+// Int implements the Item interface for integers.
+type Int int
+
+// Less returns true if int(a) < int(b).
+func (a Int) Less(b Item) bool {
+	return a < b.(Int)
+}
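
The Item interface above only asks for a strict weak ordering, so any key type can be indexed. A small, hypothetical sketch (bytesItem is illustrative and not part of the package; the import path shown is the upstream one, whereas this tree vendors the package under Godeps/_workspace):

	package main

	import (
		"bytes"
		"fmt"

		"github.com/google/btree"
	)

	// bytesItem orders raw byte-slice keys lexicographically, which satisfies
	// the strict weak ordering Less must provide.
	type bytesItem []byte

	func (b bytesItem) Less(than btree.Item) bool {
		return bytes.Compare(b, than.(bytesItem)) < 0
	}

	func main() {
		tr := btree.New(32)
		tr.ReplaceOrInsert(bytesItem("bar"))
		tr.ReplaceOrInsert(bytesItem("foo"))
		fmt.Println(tr.Has(bytesItem("foo")), tr.Len()) // true 2
	}

The treeIndex in storage/index.go further down relies on the same mechanism: its keyIndex type implements Less so that whole key histories can be stored as tree items.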

+ 76 - 0
Godeps/_workspace/src/github.com/google/btree/btree_mem.go

@@ -0,0 +1,76 @@
+// Copyright 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build ignore
+
+// This binary compares memory usage between btree and gollrb.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"math/rand"
+	"runtime"
+	"time"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/google/btree"
+	"github.com/petar/GoLLRB/llrb"
+)
+
+var (
+	size   = flag.Int("size", 1000000, "size of the tree to build")
+	degree = flag.Int("degree", 8, "degree of btree")
+	gollrb = flag.Bool("llrb", false, "use llrb instead of btree")
+)
+
+func main() {
+	flag.Parse()
+	vals := rand.Perm(*size)
+	var t, v interface{}
+	v = vals
+	var stats runtime.MemStats
+	for i := 0; i < 10; i++ {
+		runtime.GC()
+	}
+	fmt.Println("-------- BEFORE ----------")
+	runtime.ReadMemStats(&stats)
+	fmt.Printf("%+v\n", stats)
+	start := time.Now()
+	if *gollrb {
+		tr := llrb.New()
+		for _, v := range vals {
+			tr.ReplaceOrInsert(llrb.Int(v))
+		}
+		t = tr // keep it around
+	} else {
+		tr := btree.New(*degree)
+		for _, v := range vals {
+			tr.ReplaceOrInsert(btree.Int(v))
+		}
+		t = tr // keep it around
+	}
+	fmt.Printf("%v inserts in %v\n", *size, time.Since(start))
+	fmt.Println("-------- AFTER ----------")
+	runtime.ReadMemStats(&stats)
+	fmt.Printf("%+v\n", stats)
+	for i := 0; i < 10; i++ {
+		runtime.GC()
+	}
+	fmt.Println("-------- AFTER GC ----------")
+	runtime.ReadMemStats(&stats)
+	fmt.Printf("%+v\n", stats)
+	if t == v {
+		fmt.Println("to make sure vals and tree aren't GC'd")
+	}
+}

+ 293 - 0
Godeps/_workspace/src/github.com/google/btree/btree_test.go

@@ -0,0 +1,293 @@
+// Copyright 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package btree
+
+import (
+	"flag"
+	"fmt"
+	"math/rand"
+	"reflect"
+	"testing"
+	"time"
+)
+
+func init() {
+	seed := time.Now().Unix()
+	fmt.Println(seed)
+	rand.Seed(seed)
+}
+
+// perm returns a random permutation of n Int items in the range [0, n).
+func perm(n int) (out []Item) {
+	for _, v := range rand.Perm(n) {
+		out = append(out, Int(v))
+	}
+	return
+}
+
+// rang returns an ordered list of Int items in the range [0, n).
+func rang(n int) (out []Item) {
+	for i := 0; i < n; i++ {
+		out = append(out, Int(i))
+	}
+	return
+}
+
+// all extracts all items from a tree in order as a slice.
+func all(t *BTree) (out []Item) {
+	t.Ascend(func(a Item) bool {
+		out = append(out, a)
+		return true
+	})
+	return
+}
+
+var btreeDegree = flag.Int("degree", 32, "B-Tree degree")
+
+func TestBTree(t *testing.T) {
+	tr := New(*btreeDegree)
+	const treeSize = 10000
+	for i := 0; i < 10; i++ {
+		for _, item := range perm(treeSize) {
+			if x := tr.ReplaceOrInsert(item); x != nil {
+				t.Fatal("insert found item", item)
+			}
+		}
+		for _, item := range perm(treeSize) {
+			if x := tr.ReplaceOrInsert(item); x == nil {
+				t.Fatal("insert didn't find item", item)
+			}
+		}
+		got := all(tr)
+		want := rang(treeSize)
+		if !reflect.DeepEqual(got, want) {
+			t.Fatalf("mismatch:\n got: %v\nwant: %v", got, want)
+		}
+		for _, item := range perm(treeSize) {
+			if x := tr.Delete(item); x == nil {
+				t.Fatalf("didn't find %v", item)
+			}
+		}
+		if got = all(tr); len(got) > 0 {
+			t.Fatalf("some left!: %v", got)
+		}
+	}
+}
+
+func ExampleBTree() {
+	tr := New(*btreeDegree)
+	for i := Int(0); i < 10; i++ {
+		tr.ReplaceOrInsert(i)
+	}
+	fmt.Println("len:       ", tr.Len())
+	fmt.Println("get3:      ", tr.Get(Int(3)))
+	fmt.Println("get100:    ", tr.Get(Int(100)))
+	fmt.Println("del4:      ", tr.Delete(Int(4)))
+	fmt.Println("del100:    ", tr.Delete(Int(100)))
+	fmt.Println("replace5:  ", tr.ReplaceOrInsert(Int(5)))
+	fmt.Println("replace100:", tr.ReplaceOrInsert(Int(100)))
+	fmt.Println("delmin:    ", tr.DeleteMin())
+	fmt.Println("delmax:    ", tr.DeleteMax())
+	fmt.Println("len:       ", tr.Len())
+	// Output:
+	// len:        10
+	// get3:       3
+	// get100:     <nil>
+	// del4:       4
+	// del100:     <nil>
+	// replace5:   5
+	// replace100: <nil>
+	// delmin:     0
+	// delmax:     100
+	// len:        8
+}
+
+func TestDeleteMin(t *testing.T) {
+	tr := New(3)
+	for _, v := range perm(100) {
+		tr.ReplaceOrInsert(v)
+	}
+	var got []Item
+	for v := tr.DeleteMin(); v != nil; v = tr.DeleteMin() {
+		got = append(got, v)
+	}
+	if want := rang(100); !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+}
+
+func TestDeleteMax(t *testing.T) {
+	tr := New(3)
+	for _, v := range perm(100) {
+		tr.ReplaceOrInsert(v)
+	}
+	var got []Item
+	for v := tr.DeleteMax(); v != nil; v = tr.DeleteMax() {
+		got = append(got, v)
+	}
+	// Reverse our list.
+	for i := 0; i < len(got)/2; i++ {
+		got[i], got[len(got)-i-1] = got[len(got)-i-1], got[i]
+	}
+	if want := rang(100); !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+}
+
+func TestAscendRange(t *testing.T) {
+	tr := New(2)
+	for _, v := range perm(100) {
+		tr.ReplaceOrInsert(v)
+	}
+	var got []Item
+	tr.AscendRange(Int(40), Int(60), func(a Item) bool {
+		got = append(got, a)
+		return true
+	})
+	if want := rang(100)[40:60]; !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+	got = got[:0]
+	tr.AscendRange(Int(40), Int(60), func(a Item) bool {
+		if a.(Int) > 50 {
+			return false
+		}
+		got = append(got, a)
+		return true
+	})
+	if want := rang(100)[40:51]; !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+}
+
+func TestAscendLessThan(t *testing.T) {
+	tr := New(*btreeDegree)
+	for _, v := range perm(100) {
+		tr.ReplaceOrInsert(v)
+	}
+	var got []Item
+	tr.AscendLessThan(Int(60), func(a Item) bool {
+		got = append(got, a)
+		return true
+	})
+	if want := rang(100)[:60]; !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+	got = got[:0]
+	tr.AscendLessThan(Int(60), func(a Item) bool {
+		if a.(Int) > 50 {
+			return false
+		}
+		got = append(got, a)
+		return true
+	})
+	if want := rang(100)[:51]; !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+}
+
+func TestAscendGreaterOrEqual(t *testing.T) {
+	tr := New(*btreeDegree)
+	for _, v := range perm(100) {
+		tr.ReplaceOrInsert(v)
+	}
+	var got []Item
+	tr.AscendGreaterOrEqual(Int(40), func(a Item) bool {
+		got = append(got, a)
+		return true
+	})
+	if want := rang(100)[40:]; !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+	got = got[:0]
+	tr.AscendGreaterOrEqual(Int(40), func(a Item) bool {
+		if a.(Int) > 50 {
+			return false
+		}
+		got = append(got, a)
+		return true
+	})
+	if want := rang(100)[40:51]; !reflect.DeepEqual(got, want) {
+		t.Fatalf("ascendrange:\n got: %v\nwant: %v", got, want)
+	}
+}
+
+const benchmarkTreeSize = 10000
+
+func BenchmarkInsert(b *testing.B) {
+	b.StopTimer()
+	insertP := perm(benchmarkTreeSize)
+	b.StartTimer()
+	i := 0
+	for i < b.N {
+		tr := New(*btreeDegree)
+		for _, item := range insertP {
+			tr.ReplaceOrInsert(item)
+			i++
+			if i >= b.N {
+				return
+			}
+		}
+	}
+}
+
+func BenchmarkDelete(b *testing.B) {
+	b.StopTimer()
+	insertP := perm(benchmarkTreeSize)
+	removeP := perm(benchmarkTreeSize)
+	b.StartTimer()
+	i := 0
+	for i < b.N {
+		b.StopTimer()
+		tr := New(*btreeDegree)
+		for _, v := range insertP {
+			tr.ReplaceOrInsert(v)
+		}
+		b.StartTimer()
+		for _, item := range removeP {
+			tr.Delete(item)
+			i++
+			if i >= b.N {
+				return
+			}
+		}
+		if tr.Len() > 0 {
+			panic(tr.Len())
+		}
+	}
+}
+
+func BenchmarkGet(b *testing.B) {
+	b.StopTimer()
+	insertP := perm(benchmarkTreeSize)
+	removeP := perm(benchmarkTreeSize)
+	b.StartTimer()
+	i := 0
+	for i < b.N {
+		b.StopTimer()
+		tr := New(*btreeDegree)
+		for _, v := range insertP {
+			tr.ReplaceOrInsert(v)
+		}
+		b.StartTimer()
+		for _, item := range removeP {
+			tr.Get(item)
+			i++
+			if i >= b.N {
+				return
+			}
+		}
+	}
+}

+ 1 - 1
scripts/genproto.sh

@@ -5,7 +5,7 @@
 #
 
 PREFIX="github.com/coreos/etcd/Godeps/_workspace/src"
-DIRS="./wal/walpb ./etcdserver/etcdserverpb ./snap/snappb ./raft/raftpb ./migrate/etcd4pb"
+DIRS="./wal/walpb ./etcdserver/etcdserverpb ./snap/snappb ./raft/raftpb ./migrate/etcd4pb ./storage/storagepb"
 
 SHA="bc946d07d1016848dfd2507f90f0859c9471681e"
 

+ 83 - 0
storage/backend/backend.go

@@ -0,0 +1,83 @@
+package backend
+
+import (
+	"log"
+	"time"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+type Backend interface {
+	BatchTx() BatchTx
+	ForceCommit()
+	Close() error
+}
+
+type backend struct {
+	db *bolt.DB
+
+	batchInterval time.Duration
+	batchLimit    int
+	batchTx       *batchTx
+
+	stopc  chan struct{}
+	startc chan struct{}
+	donec  chan struct{}
+}
+
+func New(path string, d time.Duration, limit int) Backend {
+	db, err := bolt.Open(path, 0600, nil)
+	if err != nil {
+		log.Panicf("backend: cannot open database at %s (%v)", path, err)
+	}
+
+	b := &backend{
+		db: db,
+
+		batchInterval: d,
+		batchLimit:    limit,
+		batchTx:       &batchTx{},
+
+		stopc:  make(chan struct{}),
+		startc: make(chan struct{}),
+		donec:  make(chan struct{}),
+	}
+	b.batchTx.backend = b
+	go b.run()
+	<-b.startc
+	return b
+}
+
+// BatchTx returns the current batch tx in the coalescer. The tx can be used for read and
+// write operations. The write result can be retrieved within the same tx immediately.
+// The write result is isolated from other txs until the current one gets committed.
+func (b *backend) BatchTx() BatchTx {
+	return b.batchTx
+}
+
+// ForceCommit forces the current batching tx to commit.
+func (b *backend) ForceCommit() {
+	b.batchTx.Commit()
+}
+
+func (b *backend) run() {
+	defer close(b.donec)
+
+	b.batchTx.Commit()
+	b.startc <- struct{}{}
+
+	for {
+		select {
+		case <-time.After(b.batchInterval):
+		case <-b.stopc:
+			return
+		}
+		b.batchTx.Commit()
+	}
+}
+
+func (b *backend) Close() error {
+	close(b.stopc)
+	<-b.donec
+	return b.db.Close()
+}
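
A minimal usage sketch of the Backend and BatchTx API introduced here (the path, bucket, and key names are arbitrary; the benchmark and test below follow the same pattern):

	b := backend.New("test.db", 100*time.Millisecond, 10000)
	defer b.Close()

	tx := b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket([]byte("key"))
	tx.UnsafePut([]byte("key"), []byte("foo"), []byte("bar"))
	// Reads within the same batch tx observe the uncommitted write.
	vs := tx.UnsafeRange([]byte("key"), []byte("foo"), nil, -1)
	tx.Unlock()

	fmt.Printf("got %q\n", vs[0]) // "bar"
	b.ForceCommit()               // flush the coalesced batch to bolt immediately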

+ 36 - 0
storage/backend/backend_bench_test.go

@@ -0,0 +1,36 @@
+package backend
+
+import (
+	"crypto/rand"
+	"os"
+	"testing"
+	"time"
+)
+
+func BenchmarkBackendPut(b *testing.B) {
+	backend := New("test", 100*time.Millisecond, 10000)
+	defer backend.Close()
+	defer os.Remove("test")
+
+	// prepare keys
+	keys := make([][]byte, b.N)
+	for i := 0; i < b.N; i++ {
+		keys[i] = make([]byte, 64)
+		rand.Read(keys[i])
+	}
+	value := make([]byte, 128)
+	rand.Read(value)
+
+	batchTx := backend.BatchTx()
+
+	batchTx.Lock()
+	batchTx.UnsafeCreateBucket([]byte("test"))
+	batchTx.Unlock()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		batchTx.Lock()
+		batchTx.UnsafePut([]byte("test"), keys[i], value)
+		batchTx.Unlock()
+	}
+}

+ 29 - 0
storage/backend/backend_test.go

@@ -0,0 +1,29 @@
+package backend
+
+import (
+	"os"
+	"reflect"
+	"testing"
+	"time"
+)
+
+func TestBackendPut(t *testing.T) {
+	backend := New("test", 10*time.Second, 10000)
+	defer backend.Close()
+	defer os.Remove("test")
+
+	v := []byte("foo")
+
+	batchTx := backend.BatchTx()
+	batchTx.Lock()
+
+	batchTx.UnsafeCreateBucket([]byte("test"))
+
+	batchTx.UnsafePut([]byte("test"), []byte("foo"), v)
+	gv := batchTx.UnsafeRange([]byte("test"), v, nil, -1)
+	if !reflect.DeepEqual(gv[0], v) {
+		t.Errorf("v = %s, want %s", string(gv[0]), string(v))
+	}
+
+	batchTx.Unlock()
+}

+ 112 - 0
storage/backend/batch_tx.go

@@ -0,0 +1,112 @@
+package backend
+
+import (
+	"bytes"
+	"log"
+	"sync"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+type BatchTx interface {
+	Lock()
+	Unlock()
+	UnsafeCreateBucket(name []byte)
+	UnsafePut(bucketName []byte, key []byte, value []byte)
+	UnsafeRange(bucketName []byte, key, endKey []byte, limit int64) [][]byte
+	UnsafeDelete(bucketName []byte, key []byte)
+	Commit()
+}
+
+type batchTx struct {
+	sync.Mutex
+	tx      *bolt.Tx
+	backend *backend
+	pending int
+}
+
+func (t *batchTx) UnsafeCreateBucket(name []byte) {
+	_, err := t.tx.CreateBucket(name)
+	if err != nil && err != bolt.ErrBucketExists {
+		log.Fatalf("storage: cannot create bucket %s (%v)", string(name), err)
+	}
+}
+
+// Before calling UnsafePut, the caller MUST hold the lock on the tx.
+func (t *batchTx) UnsafePut(bucketName []byte, key []byte, value []byte) {
+	bucket := t.tx.Bucket(bucketName)
+	if bucket == nil {
+		log.Fatalf("storage: bucket %s does not exist", string(bucketName))
+	}
+	if err := bucket.Put(key, value); err != nil {
+		log.Fatalf("storage: cannot put key into bucket (%v)", err)
+	}
+	t.pending++
+	if t.pending > t.backend.batchLimit {
+		t.Commit()
+		t.pending = 0
+	}
+}
+
+// Before calling UnsafeRange, the caller MUST hold the lock on the tx.
+func (t *batchTx) UnsafeRange(bucketName []byte, key, endKey []byte, limit int64) [][]byte {
+	bucket := t.tx.Bucket(bucketName)
+	if bucket == nil {
+		log.Fatalf("storage: bucket %s does not exist", string(bucketName))
+	}
+
+	var vs [][]byte
+
+	if len(endKey) == 0 {
+		if v := bucket.Get(key); v == nil {
+			return vs
+		} else {
+			return append(vs, v)
+		}
+	}
+
+	c := bucket.Cursor()
+	for ck, cv := c.Seek(key); ck != nil && bytes.Compare(ck, endKey) < 0; ck, cv = c.Next() {
+		vs = append(vs, cv)
+	}
+
+	return vs
+}
+
+// Before calling UnsafeDelete, the caller MUST hold the lock on the tx.
+func (t *batchTx) UnsafeDelete(bucketName []byte, key []byte) {
+	bucket := t.tx.Bucket(bucketName)
+	if bucket == nil {
+		log.Fatalf("storage: bucket %s does not exist", string(bucketName))
+	}
+	err := bucket.Delete(key)
+	if err != nil {
+		log.Fatalf("storage: cannot delete key from bucket (%v)", err)
+	}
+	t.pending++
+	if t.pending > t.backend.batchLimit {
+		t.Commit()
+		t.pending = 0
+	}
+}
+
+// Commit commits the previous tx (if any) and begins a new writable one.
+func (t *batchTx) Commit() {
+	t.Lock()
+	defer t.Unlock()
+
+	var err error
+	// commit the last tx
+	if t.tx != nil {
+		err = t.tx.Commit()
+		if err != nil {
+			log.Fatalf("storage: cannot commit tx (%s)", err)
+		}
+	}
+
+	// begin a new tx
+	t.tx, err = t.backend.db.Begin(true)
+	if err != nil {
+		log.Fatalf("storage: cannot begin tx (%s)", err)
+	}
+}
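
A minimal usage sketch for the BatchTx interface above, written as a hypothetical caller package that imports the storage/backend package added in this commit; the bucket and key literals are illustrative. It shows the locking discipline the Unsafe* comments describe: take the lock, do the batched writes and reads, then unlock and let the batch limit (or an explicit Commit) flush the underlying bolt transaction.

package demo // hypothetical caller of the new backend package

import "github.com/coreos/etcd/storage/backend"

// putAndRead takes the batch tx lock, writes two keys into an illustrative
// bucket, and reads them back with a half-open range [k1, k3).
func putAndRead(tx backend.BatchTx) [][]byte {
	tx.Lock()
	defer tx.Unlock()

	tx.UnsafeCreateBucket([]byte("demo"))
	tx.UnsafePut([]byte("demo"), []byte("k1"), []byte("v1"))
	tx.UnsafePut([]byte("demo"), []byte("k2"), []byte("v2"))

	// limit is not consulted by the current implementation; -1 mirrors backend_test.go above.
	return tx.UnsafeRange([]byte("demo"), []byte("k1"), []byte("k3"), -1)
}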

+ 99 - 0
storage/index.go

@@ -0,0 +1,99 @@
+package storage
+
+import (
+	"log"
+	"sync"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/google/btree"
+)
+
+type index interface {
+	Get(key []byte, atIndex uint64) (index uint64, err error)
+	Put(key []byte, index uint64)
+	Tombstone(key []byte, index uint64) error
+	Compact(index uint64) map[uint64]struct{}
+}
+
+type treeIndex struct {
+	sync.RWMutex
+	tree *btree.BTree
+}
+
+func newTreeIndex() index {
+	return &treeIndex{
+		tree: btree.New(32),
+	}
+}
+
+func (ti *treeIndex) Put(key []byte, index uint64) {
+	keyi := &keyIndex{key: key}
+
+	ti.Lock()
+	defer ti.Unlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		keyi.put(index)
+		ti.tree.ReplaceOrInsert(keyi)
+		return
+	}
+	okeyi := item.(*keyIndex)
+	okeyi.put(index)
+}
+
+func (ti *treeIndex) Get(key []byte, atIndex uint64) (index uint64, err error) {
+	keyi := &keyIndex{key: key}
+
+	ti.RLock()
+	defer ti.RUnlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		return 0, ErrIndexNotFound
+	}
+
+	keyi = item.(*keyIndex)
+	return keyi.get(atIndex)
+}
+
+func (ti *treeIndex) Tombstone(key []byte, index uint64) error {
+	keyi := &keyIndex{key: key}
+
+	ti.Lock()
+	defer ti.Unlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		return ErrIndexNotFound
+	}
+
+	ki := item.(*keyIndex)
+	ki.tombstone(index)
+	return nil
+}
+
+func (ti *treeIndex) Compact(index uint64) map[uint64]struct{} {
+	available := make(map[uint64]struct{})
+	emptyki := make([]*keyIndex, 0)
+	log.Printf("store.index: compact %d", index)
+	// TODO: do not hold the lock for a long time?
+	// This is probably OK. Compacting 10M keys takes O(10ms).
+	ti.Lock()
+	defer ti.Unlock()
+	ti.tree.Ascend(compactIndex(index, available, &emptyki))
+	for _, ki := range emptyki {
+		item := ti.tree.Delete(ki)
+		if item == nil {
+			log.Panic("store.index: unexpected delete failure during compaction")
+		}
+	}
+	return available
+}
+
+func compactIndex(index uint64, available map[uint64]struct{}, emptyki *[]*keyIndex) func(i btree.Item) bool {
+	return func(i btree.Item) bool {
+		keyi := i.(*keyIndex)
+		keyi.compact(index, available)
+		if keyi.isEmpty() {
+			*emptyki = append(*emptyki, keyi)
+		}
+		return true
+	}
+}
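
A short sketch of driving the tree index above, written as a hypothetical helper in the storage package so it can use the unexported constructor; the key and index values are illustrative. It follows the same lifecycle the store uses: Put records each modification index, Get answers "what was the newest index at or below atIndex", Tombstone ends the current generation, and Compact reports which indexes must survive in the backend.

package storage

// indexLifecycleSketch exercises the index interface defined above with
// illustrative values.
func indexLifecycleSketch() (uint64, map[uint64]struct{}) {
	idx := newTreeIndex()

	idx.Put([]byte("foo"), 2)
	idx.Put([]byte("foo"), 5)

	// Newest index at or below 4 is 2.
	at4, _ := idx.Get([]byte("foo"), 4)

	// End the current generation for "foo" at index 7.
	idx.Tombstone([]byte("foo"), 7)

	// Drop indexes at or below 5 (keeping the newest such index per generation);
	// the returned set lists the indexes the backend still needs to keep.
	available := idx.Compact(5)

	return at4, available // 2, {5, 7}
}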

+ 137 - 0
storage/index_test.go

@@ -0,0 +1,137 @@
+package storage
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestIndexPutAndGet(t *testing.T) {
+	index := newTestTreeIndex()
+
+	tests := []T{
+		{[]byte("foo"), 0, ErrIndexNotFound, 0},
+		{[]byte("foo"), 1, nil, 1},
+		{[]byte("foo"), 3, nil, 1},
+		{[]byte("foo"), 5, nil, 5},
+		{[]byte("foo"), 6, nil, 5},
+
+		{[]byte("foo1"), 0, ErrIndexNotFound, 0},
+		{[]byte("foo1"), 1, ErrIndexNotFound, 0},
+		{[]byte("foo1"), 2, nil, 2},
+		{[]byte("foo1"), 5, nil, 2},
+		{[]byte("foo1"), 6, nil, 6},
+
+		{[]byte("foo2"), 0, ErrIndexNotFound, 0},
+		{[]byte("foo2"), 1, ErrIndexNotFound, 0},
+		{[]byte("foo2"), 3, nil, 3},
+		{[]byte("foo2"), 4, nil, 4},
+		{[]byte("foo2"), 6, nil, 4},
+	}
+	verify(t, index, tests)
+}
+
+func TestContinuousCompact(t *testing.T) {
+	index := newTestTreeIndex()
+
+	tests := []T{
+		{[]byte("foo"), 0, ErrIndexNotFound, 0},
+		{[]byte("foo"), 1, nil, 1},
+		{[]byte("foo"), 3, nil, 1},
+		{[]byte("foo"), 5, nil, 5},
+		{[]byte("foo"), 6, nil, 5},
+
+		{[]byte("foo1"), 0, ErrIndexNotFound, 0},
+		{[]byte("foo1"), 1, ErrIndexNotFound, 0},
+		{[]byte("foo1"), 2, nil, 2},
+		{[]byte("foo1"), 5, nil, 2},
+		{[]byte("foo1"), 6, nil, 6},
+
+		{[]byte("foo2"), 0, ErrIndexNotFound, 0},
+		{[]byte("foo2"), 1, ErrIndexNotFound, 0},
+		{[]byte("foo2"), 3, nil, 3},
+		{[]byte("foo2"), 4, nil, 4},
+		{[]byte("foo2"), 6, nil, 4},
+	}
+	wa := map[uint64]struct{}{
+		1: struct{}{},
+		2: struct{}{},
+		3: struct{}{},
+		4: struct{}{},
+		5: struct{}{},
+		6: struct{}{},
+	}
+	ga := index.Compact(1)
+	if !reflect.DeepEqual(ga, wa) {
+		t.Errorf("a = %v, want %v", ga, wa)
+	}
+	verify(t, index, tests)
+
+	ga = index.Compact(2)
+	if !reflect.DeepEqual(ga, wa) {
+		t.Errorf("a = %v, want %v", ga, wa)
+	}
+	verify(t, index, tests)
+
+	ga = index.Compact(3)
+	if !reflect.DeepEqual(ga, wa) {
+		t.Errorf("a = %v, want %v", ga, wa)
+	}
+	verify(t, index, tests)
+
+	ga = index.Compact(4)
+	delete(wa, 3)
+	tests[12] = T{[]byte("foo2"), 3, ErrIndexNotFound, 0}
+	if !reflect.DeepEqual(wa, ga) {
+		t.Errorf("a = %v, want %v", ga, wa)
+	}
+	verify(t, index, tests)
+
+	ga = index.Compact(5)
+	delete(wa, 1)
+	if !reflect.DeepEqual(ga, wa) {
+		t.Errorf("a = %v, want %v", ga, wa)
+	}
+	tests[1] = T{[]byte("foo"), 1, ErrIndexNotFound, 0}
+	tests[2] = T{[]byte("foo"), 3, ErrIndexNotFound, 0}
+	verify(t, index, tests)
+
+	ga = index.Compact(6)
+	delete(wa, 2)
+	if !reflect.DeepEqual(ga, wa) {
+		t.Errorf("a = %v, want %v", ga, wa)
+	}
+	tests[7] = T{[]byte("foo1"), 2, ErrIndexNotFound, 0}
+	tests[8] = T{[]byte("foo1"), 5, ErrIndexNotFound, 0}
+	verify(t, index, tests)
+}
+
+func verify(t *testing.T, index index, tests []T) {
+	for i, tt := range tests {
+		h, err := index.Get(tt.key, tt.index)
+		if err != tt.werr {
+			t.Errorf("#%d: err = %v, want %v", i, err, tt.werr)
+		}
+		if h != tt.windex {
+			t.Errorf("#%d: index = %d, want %d", i, h, tt.windex)
+		}
+	}
+}
+
+type T struct {
+	key   []byte
+	index uint64
+
+	werr   error
+	windex uint64
+}
+
+func newTestTreeIndex() index {
+	index := newTreeIndex()
+	index.Put([]byte("foo"), 1)
+	index.Put([]byte("foo1"), 2)
+	index.Put([]byte("foo2"), 3)
+	index.Put([]byte("foo2"), 4)
+	index.Put([]byte("foo"), 5)
+	index.Put([]byte("foo1"), 6)
+	return index
+}

+ 205 - 0
storage/key_index.go

@@ -0,0 +1,205 @@
+package storage
+
+import (
+	"bytes"
+	"errors"
+	"log"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/google/btree"
+)
+
+var (
+	ErrIndexNotFound = errors.New("index: not found")
+)
+
+// keyIndex stores the index of a key in the backend.
+// Each keyIndex has at least one key generation.
+// Each generation might have several key versions.
+// Tombstone on a key appends a tombstone version at the end
+// of the current generation and creates a new empty generation.
+// Each version of a key has an index pointing to the backend.
+//
+// For example: put(1);put(2);tombstone(3);put(4);tombstone(5) on key "foo"
+// generates a keyIndex:
+// key:     "foo"
+// index: 5
+// generations:
+//    {empty}
+//    {4, 5(t)}
+//    {1, 2, 3(t)}
+//
+// Compacting a keyIndex removes the versions with indexes smaller than
+// or equal to the given index, except the largest one. If a generation
+// becomes empty during compaction, it will be removed. If all the
+// generations are removed, the keyIndex should be removed.
+//
+// For example:
+// compact(2) on the previous example
+// generations:
+//    {empty}
+//    {4, 5(t)}
+//    {2, 3(t)}
+//
+// compact(4)
+// generations:
+//    {empty}
+//    {4, 5(t)}
+//
+// compact(5):
+// generations:
+//    {empty}
+//    {5(t)}
+//
+// compact(6):
+// generations:
+//    {empty} -> key SHOULD be removed.
+type keyIndex struct {
+	key         []byte
+	index       uint64
+	generations []generation
+}
+
+// put puts an index to the keyIndex.
+func (ki *keyIndex) put(index uint64) {
+	if index < ki.index {
+		log.Panicf("store.keyindex: put with unexpected smaller index [%d / %d]", index, ki.index)
+	}
+	if len(ki.generations) == 0 {
+		ki.generations = append(ki.generations, generation{})
+	}
+	g := &ki.generations[len(ki.generations)-1]
+	g.cont = append(g.cont, index)
+	g.ver++
+	ki.index = index
+}
+
+// tombstone puts an index, pointing to a tombstone, to the keyIndex.
+// It also creates a new empty generation in the keyIndex.
+func (ki *keyIndex) tombstone(index uint64) {
+	if ki.isEmpty() {
+		log.Panicf("store.keyindex: unexpected tombstone on empty keyIndex %s", string(ki.key))
+	}
+	ki.put(index)
+	ki.generations = append(ki.generations, generation{})
+}
+
+// get gets the index of the key that satisfies the given atIndex.
+// The returned index must be lower than or equal to the given atIndex.
+func (ki *keyIndex) get(atIndex uint64) (index uint64, err error) {
+	if ki.isEmpty() {
+		log.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
+	}
+	g := ki.findGeneration(atIndex)
+	if g.isEmpty() {
+		return 0, ErrIndexNotFound
+	}
+
+	f := func(index, ver uint64) bool {
+		if index <= atIndex {
+			return false
+		}
+		return true
+	}
+
+	_, n := g.walk(f)
+	if n != -1 {
+		return g.cont[n], nil
+	}
+	return 0, ErrIndexNotFound
+}
+
+// compact compacts a keyIndex by removing the versions with indexes smaller
+// than or equal to the given atIndex, except the largest one.
+// If a generation becomes empty during compaction, it will be removed.
+func (ki *keyIndex) compact(atIndex uint64, available map[uint64]struct{}) {
+	if ki.isEmpty() {
+		log.Panic("store.keyindex: unexpected compact on empty keyIndex %s", string(ki.key))
+	}
+	// walk until reaching the first index that is smaller than or equal to
+	// the given atIndex.
+	// add all the reached indexes into the available map.
+	f := func(index, _ uint64) bool {
+		available[index] = struct{}{}
+		if index <= atIndex {
+			return false
+		}
+		return true
+	}
+
+	g := ki.findGeneration(atIndex)
+	i := len(ki.generations) - 1
+	for i >= 0 {
+		wg := &ki.generations[i]
+		if wg == g {
+			break
+		}
+		wg.walk(f)
+		i--
+	}
+
+	_, n := g.walk(f)
+
+	// remove the previous contents.
+	if n != -1 {
+		g.cont = g.cont[n:]
+	}
+	// remove the previous generations.
+	ki.generations = ki.generations[i:]
+
+	return
+}
+
+func (ki *keyIndex) isEmpty() bool {
+	return len(ki.generations) == 1 && ki.generations[0].isEmpty()
+}
+
+// findGeneration finds the generation of the keyIndex that the
+// given index belongs to.
+func (ki *keyIndex) findGeneration(index uint64) *generation {
+	g, youngerg := len(ki.generations)-1, len(ki.generations)-2
+
+	// If the head index of a younger generation is smaller than
+	// the given index, the index cannot be in the younger
+	// generation.
+	for youngerg >= 0 && ki.generations[youngerg].cont != nil {
+		yg := ki.generations[youngerg]
+		if yg.cont[len(yg.cont)-1] < index {
+			break
+		}
+		g--
+		youngerg--
+	}
+	if g < 0 {
+		return nil
+	}
+	return &ki.generations[g]
+}
+
+func (a *keyIndex) Less(b btree.Item) bool {
+	return bytes.Compare(a.key, b.(*keyIndex).key) == -1
+}
+
+type generation struct {
+	ver  uint64
+	cont []uint64
+}
+
+func (g *generation) isEmpty() bool { return len(g.cont) == 0 }
+
+// walk walks through the (index, version) pairs in the generation in descending index order.
+// It passes each (index, version) pair to the given function.
+// walk stops when either 1. it finishes walking all pairs, or 2. the function returns false.
+// walk returns the version and the position in cont at which it stopped. If it stopped after
+// finishing walking, (0, -1) will be returned.
+func (g *generation) walk(f func(index, ver uint64) bool) (uint64, int) {
+	ver := g.ver
+	l := len(g.cont)
+	for i := range g.cont {
+		ok := f(g.cont[l-i-1], ver)
+		if !ok {
+			return ver, l - i - 1
+		}
+		ver--
+	}
+	return 0, -1
+}
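
A sketch that replays the example from the keyIndex doc comment above (put(1); put(2); tombstone(3); put(4); tombstone(5) on "foo") and then compacts at 4, written as a hypothetical helper in the storage package.

package storage

// keyIndexExampleSketch replays the doc-comment example and shows what
// get and compact return for it.
func keyIndexExampleSketch() {
	ki := &keyIndex{key: []byte("foo")}
	ki.put(1)
	ki.put(2)
	ki.tombstone(3) // closes generation {1, 2, 3(t)} and opens an empty one
	ki.put(4)
	ki.tombstone(5) // closes generation {4, 5(t)}

	// ki.index == 5; ki.generations == [{1,2,3}, {4,5}, {}] (oldest first).

	at4, _ := ki.get(4) // at4 == 4: the newest index at or below 4

	// compact(4) drops generation {1,2,3} entirely and keeps {4,5};
	// the indexes that must be kept are recorded in available.
	available := make(map[uint64]struct{})
	ki.compact(4, available) // available == {4: {}, 5: {}}
	_ = at4
}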

+ 364 - 0
storage/key_index_test.go

@@ -0,0 +1,364 @@
+package storage
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestKeyIndexGet(t *testing.T) {
+	// key:     "foo"
+	// index: 12
+	// generations:
+	//    {empty}
+	//    {8[1], 10[2], 12(t)[3]}
+	//    {4[2], 6(t)[3]}
+	ki := newTestKeyIndex()
+	ki.compact(4, make(map[uint64]struct{}))
+
+	tests := []struct {
+		index uint64
+
+		windex uint64
+		werr   error
+	}{
+		// expect not found on an index that is greater than the last tombstone
+		{13, 0, ErrIndexNotFound},
+		{13, 0, ErrIndexNotFound},
+
+		// get on generation 2
+		{12, 12, nil},
+		{11, 10, nil},
+		{10, 10, nil},
+		{9, 8, nil},
+		{8, 8, nil},
+		{7, 0, ErrIndexNotFound},
+
+		// get on generation 1
+		{6, 6, nil},
+		{5, 4, nil},
+		{4, 4, nil},
+	}
+
+	for i, tt := range tests {
+		index, err := ki.get(tt.index)
+		if err != tt.werr {
+			t.Errorf("#%d: err = %v, want %v", i, err, tt.werr)
+		}
+		if index != tt.windex {
+			t.Errorf("#%d: index = %d, want %d", i, index, tt.index)
+		}
+	}
+}
+
+func TestKeyIndexPut(t *testing.T) {
+	ki := &keyIndex{key: []byte("foo")}
+	ki.put(5)
+
+	wki := &keyIndex{
+		key:         []byte("foo"),
+		index:       5,
+		generations: []generation{{ver: 1, cont: []uint64{5}}},
+	}
+	if !reflect.DeepEqual(ki, wki) {
+		t.Errorf("ki = %+v, want %+v", ki, wki)
+	}
+
+	ki.put(7)
+
+	wki = &keyIndex{
+		key:         []byte("foo"),
+		index:       7,
+		generations: []generation{{ver: 2, cont: []uint64{5, 7}}},
+	}
+	if !reflect.DeepEqual(ki, wki) {
+		t.Errorf("ki = %+v, want %+v", ki, wki)
+	}
+}
+
+func TestKeyIndexTombstone(t *testing.T) {
+	ki := &keyIndex{key: []byte("foo")}
+	ki.put(5)
+
+	ki.tombstone(7)
+
+	wki := &keyIndex{
+		key:         []byte("foo"),
+		index:       7,
+		generations: []generation{{ver: 2, cont: []uint64{5, 7}}, {}},
+	}
+	if !reflect.DeepEqual(ki, wki) {
+		t.Errorf("ki = %+v, want %+v", ki, wki)
+	}
+
+	ki.put(8)
+	ki.put(9)
+	ki.tombstone(15)
+
+	wki = &keyIndex{
+		key:         []byte("foo"),
+		index:       15,
+		generations: []generation{{ver: 2, cont: []uint64{5, 7}}, {ver: 3, cont: []uint64{8, 9, 15}}, {}},
+	}
+	if !reflect.DeepEqual(ki, wki) {
+		t.Errorf("ki = %+v, want %+v", ki, wki)
+	}
+}
+
+func TestKeyIndexCompact(t *testing.T) {
+	tests := []struct {
+		compact uint64
+
+		wki *keyIndex
+		wam map[uint64]struct{}
+	}{
+		{
+			1,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{2, 4, 6}},
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				2: struct{}{}, 4: struct{}{}, 6: struct{}{},
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			2,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{2, 4, 6}},
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				2: struct{}{}, 4: struct{}{}, 6: struct{}{},
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			3,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{2, 4, 6}},
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				2: struct{}{}, 4: struct{}{}, 6: struct{}{},
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			4,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{4, 6}},
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				4: struct{}{}, 6: struct{}{},
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			5,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{4, 6}},
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				4: struct{}{}, 6: struct{}{},
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			6,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{6}},
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				6: struct{}{},
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			7,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			8,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			9,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{8, 10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				8: struct{}{}, 10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			10,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			11,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{10, 12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				10: struct{}{}, 12: struct{}{},
+			},
+		},
+		{
+			12,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{ver: 3, cont: []uint64{12}},
+					{},
+				},
+			},
+			map[uint64]struct{}{
+				12: struct{}{},
+			},
+		},
+		{
+			13,
+			&keyIndex{
+				key:   []byte("foo"),
+				index: 12,
+				generations: []generation{
+					{},
+				},
+			},
+			map[uint64]struct{}{},
+		},
+	}
+
+	// Continuous Compaction
+	ki := newTestKeyIndex()
+	for i, tt := range tests {
+		am := make(map[uint64]struct{})
+		ki.compact(tt.compact, am)
+		if !reflect.DeepEqual(ki, tt.wki) {
+			t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
+		}
+		if !reflect.DeepEqual(am, tt.wam) {
+			t.Errorf("#%d: am = %+v, want %+v", am, tt.wam)
+		}
+	}
+
+	// Jump Compaction
+	ki = newTestKeyIndex()
+	for i, tt := range tests {
+		if (i%2 == 0 && i < 6) || (i%2 == 1 && i > 6) {
+			am := make(map[uint64]struct{})
+			ki.compact(tt.compact, am)
+			if !reflect.DeepEqual(ki, tt.wki) {
+				t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
+			}
+			if !reflect.DeepEqual(am, tt.wam) {
+				t.Errorf("#%d: am = %+v, want %+v", am, tt.wam)
+			}
+		}
+	}
+
+	// Once Compaction
+	for i, tt := range tests {
+		ki := newTestKeyIndex()
+		am := make(map[uint64]struct{})
+		ki.compact(tt.compact, am)
+		if !reflect.DeepEqual(ki, tt.wki) {
+			t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
+		}
+		if !reflect.DeepEqual(am, tt.wam) {
+			t.Errorf("#%d: am = %+v, want %+v", am, tt.wam)
+		}
+	}
+}
+
+func newTestKeyIndex() *keyIndex {
+	// key:     "foo"
+	// index: 12
+	// generations:
+	//    {empty}
+	//    {8[1], 10[2], 12(t)[3]}
+	//    {2[1], 4[2], 6(t)[3]}
+
+	ki := &keyIndex{key: []byte("foo")}
+	ki.put(2)
+	ki.put(4)
+	ki.tombstone(6)
+	ki.put(8)
+	ki.put(10)
+	ki.tombstone(12)
+	return ki
+}

+ 132 - 0
storage/kv.go

@@ -0,0 +1,132 @@
+package storage
+
+import (
+	"encoding/binary"
+	"log"
+	"sync"
+	"time"
+
+	"github.com/coreos/etcd/storage/backend"
+	"github.com/coreos/etcd/storage/storagepb"
+)
+
+var (
+	batchLimit    = 10000
+	batchInterval = 100 * time.Millisecond
+	keyBucketName = []byte("key")
+)
+
+type store struct {
+	// read operation MUST hold read lock
+	// write operation MUST hold write lock
+	sync.RWMutex
+
+	b       backend.Backend
+	kvindex index
+
+	currentIndex uint64
+	marshalBuf   []byte // buffer for marshaling protobuf events
+}
+
+func newStore(path string) *store {
+	s := &store{
+		b:            backend.New(path, batchInterval, batchLimit),
+		kvindex:      newTreeIndex(),
+		currentIndex: 0,
+		marshalBuf:   make([]byte, 1024*1024),
+	}
+
+	tx := s.b.BatchTx()
+	tx.Lock()
+	tx.UnsafeCreateBucket(keyBucketName)
+	tx.Unlock()
+	s.b.ForceCommit()
+
+	return s
+}
+
+func (s *store) Put(key, value []byte) {
+	s.Lock()
+	defer s.Unlock()
+
+	currentIndex := s.currentIndex + 1
+
+	ibytes := make([]byte, 8)
+	binary.BigEndian.PutUint64(ibytes, currentIndex)
+
+	tx := s.b.BatchTx()
+	tx.Lock()
+	defer tx.Unlock()
+	s.currentIndex = currentIndex
+
+	event := storagepb.Event{
+		Type: storagepb.PUT,
+		Kv: storagepb.KeyValue{
+			Key:   key,
+			Value: value,
+		},
+	}
+
+	var (
+		d   []byte
+		err error
+		n   int
+	)
+
+	if event.Size() < len(s.marshalBuf) {
+		n, err = event.MarshalTo(s.marshalBuf)
+		d = s.marshalBuf[:n]
+	} else {
+		d, err = event.Marshal()
+	}
+	if err != nil {
+		log.Fatalf("storage: cannot marshal event: %v", err)
+	}
+
+	tx.UnsafePut(keyBucketName, ibytes, d)
+
+	s.kvindex.Put(key, currentIndex)
+}
+
+func (s *store) Get(key []byte) []byte {
+	s.RLock()
+	defer s.RUnlock()
+
+	index, err := s.kvindex.Get(key, s.currentIndex)
+	if err != nil {
+		return nil
+	}
+
+	ibytes := make([]byte, 8)
+	binary.BigEndian.PutUint64(ibytes, index)
+	tx := s.b.BatchTx()
+	tx.Lock()
+	defer tx.Unlock()
+	vs := tx.UnsafeRange(keyBucketName, ibytes, nil, 0)
+	// TODO: the value will be an event type.
+	// TODO: copy out the bytes, decode it, return the value.
+	return vs[0]
+}
+
+func (s *store) Delete(key []byte) error {
+	s.Lock()
+	defer s.Unlock()
+
+	_, err := s.kvindex.Get(key, s.currentIndex)
+	if err != nil {
+		return nil
+	}
+
+	currentIndex := s.currentIndex + 1
+
+	ibytes := make([]byte, 8)
+	binary.BigEndian.PutUint64(ibytes, currentIndex)
+	tx := s.b.BatchTx()
+	tx.Lock()
+	defer tx.Unlock()
+	// TODO: the value will be an event type.
+	// A tombstone is simply a "Delete" type event, stored under the
+	// index-encoded key like every other event.
+	tx.UnsafePut(keyBucketName, ibytes, []byte("tombstone"))
+
+	return s.kvindex.Tombstone(key, currentIndex)
+}
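
A minimal end-to-end sketch of the store defined above; the file name is illustrative, and (per the TODOs) Get currently returns the raw marshaled event bytes rather than a decoded value.

package storage

import "os"

// storeUsageSketch runs a put, a read, and a delete through the new store.
func storeUsageSketch() {
	s := newStore("demo-store")
	defer os.Remove("demo-store")

	s.Put([]byte("foo"), []byte("bar"))

	// The key index resolves "foo" to its latest modification index and the
	// backend returns the event bytes stored under that index.
	raw := s.Get([]byte("foo"))
	_ = raw

	// Delete writes a tombstone event and tombstones the key in the index.
	if err := s.Delete([]byte("foo")); err != nil {
		panic(err)
	}
}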

+ 24 - 0
storage/kv_test.go

@@ -0,0 +1,24 @@
+package storage
+
+import (
+	"crypto/rand"
+	"os"
+	"testing"
+)
+
+func BenchmarkStorePut(b *testing.B) {
+	s := newStore("test")
+	defer os.Remove("test")
+
+	// prepare keys
+	keys := make([][]byte, b.N)
+	for i := 0; i < b.N; i++ {
+		keys[i] = make([]byte, 64)
+		rand.Read(keys[i])
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Put(keys[i], []byte("foo"))
+	}
+}
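
A hypothetical companion benchmark, assuming it would sit alongside BenchmarkStorePut in the same file: it pre-populates the store with the keys it will read, then measures only the Get path (index lookup plus backend range) under the store's read lock.

package storage

import (
	"crypto/rand"
	"os"
	"testing"
)

// BenchmarkStoreGet is a hypothetical companion to BenchmarkStorePut above.
func BenchmarkStoreGet(b *testing.B) {
	s := newStore("test")
	defer os.Remove("test")

	// prepare and insert keys
	keys := make([][]byte, b.N)
	for i := 0; i < b.N; i++ {
		keys[i] = make([]byte, 64)
		rand.Read(keys[i])
		s.Put(keys[i], []byte("foo"))
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		s.Get(keys[i])
	}
}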

+ 456 - 0
storage/storagepb/kv.pb.go

@@ -0,0 +1,456 @@
+// Code generated by protoc-gen-gogo.
+// source: kv.proto
+// DO NOT EDIT!
+
+/*
+	Package storagepb is a generated protocol buffer package.
+
+	It is generated from these files:
+		kv.proto
+
+	It has these top-level messages:
+		KeyValue
+		Event
+*/
+package storagepb
+
+import proto "github.com/coreos/etcd/Godeps/_workspace/src/github.com/gogo/protobuf/proto"
+import math "math"
+
+// discarding unused import gogoproto "github.com/gogo/protobuf/gogoproto/gogo.pb"
+
+import io "io"
+import fmt "fmt"
+import github_com_gogo_protobuf_proto "github.com/coreos/etcd/Godeps/_workspace/src/github.com/gogo/protobuf/proto"
+
+// Reference imports to suppress errors if they are not otherwise used.
+var _ = proto.Marshal
+var _ = math.Inf
+
+type Event_EventType int32
+
+const (
+	PUT    Event_EventType = 0
+	DELETE Event_EventType = 1
+	EXPIRE Event_EventType = 2
+)
+
+var Event_EventType_name = map[int32]string{
+	0: "PUT",
+	1: "DELETE",
+	2: "EXPIRE",
+}
+var Event_EventType_value = map[string]int32{
+	"PUT":    0,
+	"DELETE": 1,
+	"EXPIRE": 2,
+}
+
+func (x Event_EventType) Enum() *Event_EventType {
+	p := new(Event_EventType)
+	*p = x
+	return p
+}
+func (x Event_EventType) String() string {
+	return proto.EnumName(Event_EventType_name, int32(x))
+}
+func (x *Event_EventType) UnmarshalJSON(data []byte) error {
+	value, err := proto.UnmarshalJSONEnum(Event_EventType_value, data, "Event_EventType")
+	if err != nil {
+		return err
+	}
+	*x = Event_EventType(value)
+	return nil
+}
+
+type KeyValue struct {
+	Key []byte `protobuf:"bytes,1,opt,name=key" json:"key"`
+	CreateIndex int64 `protobuf:"varint,2,opt,name=create_index" json:"create_index"`
+	// mod_index is the last modified index of the key.
+	ModIndex    int64 `protobuf:"varint,3,opt,name=mod_index" json:"mod_index"`
+	// version is the version of the key. A deletion resets
+	// the version to zero and any modification of the key
+	// increases its version.
+	Version          int64  `protobuf:"varint,4,opt,name=version" json:"version"`
+	Value            []byte `protobuf:"bytes,5,opt,name=value" json:"value"`
+	XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *KeyValue) Reset()         { *m = KeyValue{} }
+func (m *KeyValue) String() string { return proto.CompactTextString(m) }
+func (*KeyValue) ProtoMessage()    {}
+
+type Event struct {
+	Type Event_EventType `protobuf:"varint,1,opt,name=type,enum=storagepb.Event_EventType" json:"type"`
+	// a put event contains the current key-value
+	// a delete/expire event contains the previous
+	// key-value
+	Kv               KeyValue `protobuf:"bytes,2,opt,name=kv" json:"kv"`
+	XXX_unrecognized []byte   `json:"-"`
+}
+
+func (m *Event) Reset()         { *m = Event{} }
+func (m *Event) String() string { return proto.CompactTextString(m) }
+func (*Event) ProtoMessage()    {}
+
+func init() {
+	proto.RegisterEnum("storagepb.Event_EventType", Event_EventType_name, Event_EventType_value)
+}
+func (m *KeyValue) Unmarshal(data []byte) error {
+	l := len(data)
+	index := 0
+	for index < l {
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if index >= l {
+				return io.ErrUnexpectedEOF
+			}
+			b := data[index]
+			index++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		fieldNum := int32(wire >> 3)
+		wireType := int(wire & 0x7)
+		switch fieldNum {
+		case 1:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
+			}
+			var byteLen int
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				byteLen |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			postIndex := index + byteLen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.Key = append([]byte{}, data[index:postIndex]...)
+			index = postIndex
+		case 2:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field CreateIndex", wireType)
+			}
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				m.CreateIndex |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 3:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field ModIndex", wireType)
+			}
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				m.ModIndex |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 4:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType)
+			}
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				m.Version |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 5:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
+			}
+			var byteLen int
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				byteLen |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			postIndex := index + byteLen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.Value = append([]byte{}, data[index:postIndex]...)
+			index = postIndex
+		default:
+			var sizeOfWire int
+			for {
+				sizeOfWire++
+				wire >>= 7
+				if wire == 0 {
+					break
+				}
+			}
+			index -= sizeOfWire
+			skippy, err := github_com_gogo_protobuf_proto.Skip(data[index:])
+			if err != nil {
+				return err
+			}
+			if (index + skippy) > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.XXX_unrecognized = append(m.XXX_unrecognized, data[index:index+skippy]...)
+			index += skippy
+		}
+	}
+	return nil
+}
+func (m *Event) Unmarshal(data []byte) error {
+	l := len(data)
+	index := 0
+	for index < l {
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if index >= l {
+				return io.ErrUnexpectedEOF
+			}
+			b := data[index]
+			index++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		fieldNum := int32(wire >> 3)
+		wireType := int(wire & 0x7)
+		switch fieldNum {
+		case 1:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
+			}
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				m.Type |= (Event_EventType(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 2:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Kv", wireType)
+			}
+			var msglen int
+			for shift := uint(0); ; shift += 7 {
+				if index >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[index]
+				index++
+				msglen |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			postIndex := index + msglen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			if err := m.Kv.Unmarshal(data[index:postIndex]); err != nil {
+				return err
+			}
+			index = postIndex
+		default:
+			var sizeOfWire int
+			for {
+				sizeOfWire++
+				wire >>= 7
+				if wire == 0 {
+					break
+				}
+			}
+			index -= sizeOfWire
+			skippy, err := github_com_gogo_protobuf_proto.Skip(data[index:])
+			if err != nil {
+				return err
+			}
+			if (index + skippy) > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.XXX_unrecognized = append(m.XXX_unrecognized, data[index:index+skippy]...)
+			index += skippy
+		}
+	}
+	return nil
+}
+func (m *KeyValue) Size() (n int) {
+	var l int
+	_ = l
+	if m.Key != nil {
+		l = len(m.Key)
+		n += 1 + l + sovKv(uint64(l))
+	}
+	n += 1 + sovKv(uint64(m.CreateIndex))
+	n += 1 + sovKv(uint64(m.ModIndex))
+	n += 1 + sovKv(uint64(m.Version))
+	if m.Value != nil {
+		l = len(m.Value)
+		n += 1 + l + sovKv(uint64(l))
+	}
+	if m.XXX_unrecognized != nil {
+		n += len(m.XXX_unrecognized)
+	}
+	return n
+}
+
+func (m *Event) Size() (n int) {
+	var l int
+	_ = l
+	n += 1 + sovKv(uint64(m.Type))
+	l = m.Kv.Size()
+	n += 1 + l + sovKv(uint64(l))
+	if m.XXX_unrecognized != nil {
+		n += len(m.XXX_unrecognized)
+	}
+	return n
+}
+
+func sovKv(x uint64) (n int) {
+	for {
+		n++
+		x >>= 7
+		if x == 0 {
+			break
+		}
+	}
+	return n
+}
+func sozKv(x uint64) (n int) {
+	return sovKv(uint64((x << 1) ^ uint64((int64(x) >> 63))))
+}
+func (m *KeyValue) Marshal() (data []byte, err error) {
+	size := m.Size()
+	data = make([]byte, size)
+	n, err := m.MarshalTo(data)
+	if err != nil {
+		return nil, err
+	}
+	return data[:n], nil
+}
+
+func (m *KeyValue) MarshalTo(data []byte) (n int, err error) {
+	var i int
+	_ = i
+	var l int
+	_ = l
+	if m.Key != nil {
+		data[i] = 0xa
+		i++
+		i = encodeVarintKv(data, i, uint64(len(m.Key)))
+		i += copy(data[i:], m.Key)
+	}
+	data[i] = 0x10
+	i++
+	i = encodeVarintKv(data, i, uint64(m.CreateIndex))
+	data[i] = 0x18
+	i++
+	i = encodeVarintKv(data, i, uint64(m.ModIndex))
+	data[i] = 0x20
+	i++
+	i = encodeVarintKv(data, i, uint64(m.Version))
+	if m.Value != nil {
+		data[i] = 0x2a
+		i++
+		i = encodeVarintKv(data, i, uint64(len(m.Value)))
+		i += copy(data[i:], m.Value)
+	}
+	if m.XXX_unrecognized != nil {
+		i += copy(data[i:], m.XXX_unrecognized)
+	}
+	return i, nil
+}
+
+func (m *Event) Marshal() (data []byte, err error) {
+	size := m.Size()
+	data = make([]byte, size)
+	n, err := m.MarshalTo(data)
+	if err != nil {
+		return nil, err
+	}
+	return data[:n], nil
+}
+
+func (m *Event) MarshalTo(data []byte) (n int, err error) {
+	var i int
+	_ = i
+	var l int
+	_ = l
+	data[i] = 0x8
+	i++
+	i = encodeVarintKv(data, i, uint64(m.Type))
+	data[i] = 0x12
+	i++
+	i = encodeVarintKv(data, i, uint64(m.Kv.Size()))
+	n1, err := m.Kv.MarshalTo(data[i:])
+	if err != nil {
+		return 0, err
+	}
+	i += n1
+	if m.XXX_unrecognized != nil {
+		i += copy(data[i:], m.XXX_unrecognized)
+	}
+	return i, nil
+}
+
+func encodeFixed64Kv(data []byte, offset int, v uint64) int {
+	data[offset] = uint8(v)
+	data[offset+1] = uint8(v >> 8)
+	data[offset+2] = uint8(v >> 16)
+	data[offset+3] = uint8(v >> 24)
+	data[offset+4] = uint8(v >> 32)
+	data[offset+5] = uint8(v >> 40)
+	data[offset+6] = uint8(v >> 48)
+	data[offset+7] = uint8(v >> 56)
+	return offset + 8
+}
+func encodeFixed32Kv(data []byte, offset int, v uint32) int {
+	data[offset] = uint8(v)
+	data[offset+1] = uint8(v >> 8)
+	data[offset+2] = uint8(v >> 16)
+	data[offset+3] = uint8(v >> 24)
+	return offset + 4
+}
+func encodeVarintKv(data []byte, offset int, v uint64) int {
+	for v >= 1<<7 {
+		data[offset] = uint8(v&0x7f | 0x80)
+		v >>= 7
+		offset++
+	}
+	data[offset] = uint8(v)
+	return offset + 1
+}
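
A small round-trip sketch with the generated types above, written as a hypothetical caller package; it mirrors what the store's Put path does when it marshals an Event before writing it to the backend (the read path's decode is still a TODO in kv.go).

package demo // hypothetical caller of the generated storagepb package

import (
	"fmt"

	"github.com/coreos/etcd/storage/storagepb"
)

// roundTrip marshals an Event with the generated code and unmarshals it back.
func roundTrip() error {
	ev := storagepb.Event{
		Type: storagepb.PUT,
		Kv:   storagepb.KeyValue{Key: []byte("foo"), Value: []byte("bar"), ModIndex: 1, Version: 1},
	}

	data, err := ev.Marshal()
	if err != nil {
		return err
	}

	var got storagepb.Event
	if err := got.Unmarshal(data); err != nil {
		return err
	}

	fmt.Println(got.Type, string(got.Kv.Key), string(got.Kv.Value)) // PUT foo bar
	return nil
}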

+ 35 - 0
storage/storagepb/kv.proto

@@ -0,0 +1,35 @@
+package storagepb;
+
+import "github.com/gogo/protobuf/gogoproto/gogo.proto";
+
+option (gogoproto.marshaler_all) = true;
+option (gogoproto.sizer_all) = true;
+option (gogoproto.unmarshaler_all) = true;
+option (gogoproto.goproto_getters_all) = false;
+option (gogoproto.goproto_enum_prefix_all) = false;
+
+message KeyValue {
+  optional bytes key = 1 [(gogoproto.nullable) = false];
+  optional int64 create_index = 2 [(gogoproto.nullable) = false];
+  // mod_index is the last modified index of the key.
+  optional int64 mod_index = 3 [(gogoproto.nullable) = false];
+  // version is the version of the key. A deletion resets
+  // the version to zero and any modification of the key
+  // increases its version.
+  optional int64 version = 4 [(gogoproto.nullable) = false];
+  optional bytes value = 5 [(gogoproto.nullable) = false];
+}
+
+message Event {
+  enum EventType {
+    PUT = 0;
+    DELETE = 1;
+    EXPIRE = 2;
+  }
+  optional EventType type = 1 [(gogoproto.nullable) = false];
+  // a put event contains the current key-value
+  // a delete/expire event contains the previous
+  // key-value
+  optional KeyValue kv = 2 [(gogoproto.nullable) = false];
+}
+