123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748 |
- package bolt
- import (
- "bytes"
- "fmt"
- "unsafe"
- )
- const (
- // MaxKeySize is the maximum length of a key, in bytes.
- MaxKeySize = 32768
- // MaxValueSize is the maximum length of a value, in bytes.
- MaxValueSize = (1 << 31) - 2
- )
- const (
- maxUint = ^uint(0)
- minUint = 0
- maxInt = int(^uint(0) >> 1)
- minInt = -maxInt - 1
- )
- const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
- const (
- minFillPercent = 0.1
- maxFillPercent = 1.0
- )
- // DefaultFillPercent is the percentage that split pages are filled.
- // This value can be changed by setting Bucket.FillPercent.
- const DefaultFillPercent = 0.5
- // Bucket represents a collection of key/value pairs inside the database.
- type Bucket struct {
- *bucket
- tx *Tx // the associated transaction
- buckets map[string]*Bucket // subbucket cache
- page *page // inline page reference
- rootNode *node // materialized node for the root page.
- nodes map[pgid]*node // node cache
- // Sets the threshold for filling nodes when they split. By default,
- // the bucket will fill to 50% but it can be useful to increase this
- // amount if you know that your write workloads are mostly append-only.
- //
- // This is non-persisted across transactions so it must be set in every Tx.
- FillPercent float64
- }
- // bucket represents the on-file representation of a bucket.
- // This is stored as the "value" of a bucket key. If the bucket is small enough,
- // then its root page can be stored inline in the "value", after the bucket
- // header. In the case of inline buckets, the "root" will be 0.
- type bucket struct {
- root pgid // page id of the bucket's root-level page
- sequence uint64 // monotonically incrementing, used by NextSequence()
- }
- // newBucket returns a new bucket associated with a transaction.
- func newBucket(tx *Tx) Bucket {
- var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
- if tx.writable {
- b.buckets = make(map[string]*Bucket)
- b.nodes = make(map[pgid]*node)
- }
- return b
- }
- // Tx returns the tx of the bucket.
- func (b *Bucket) Tx() *Tx {
- return b.tx
- }
- // Root returns the root of the bucket.
- func (b *Bucket) Root() pgid {
- return b.root
- }
- // Writable returns whether the bucket is writable.
- func (b *Bucket) Writable() bool {
- return b.tx.writable
- }
- // Cursor creates a cursor associated with the bucket.
- // The cursor is only valid as long as the transaction is open.
- // Do not use a cursor after the transaction is closed.
- func (b *Bucket) Cursor() *Cursor {
- // Update transaction statistics.
- b.tx.stats.CursorCount++
- // Allocate and return a cursor.
- return &Cursor{
- bucket: b,
- stack: make([]elemRef, 0),
- }
- }
- // Bucket retrieves a nested bucket by name.
- // Returns nil if the bucket does not exist.
- // The bucket instance is only valid for the lifetime of the transaction.
- func (b *Bucket) Bucket(name []byte) *Bucket {
- if b.buckets != nil {
- if child := b.buckets[string(name)]; child != nil {
- return child
- }
- }
- // Move cursor to key.
- c := b.Cursor()
- k, v, flags := c.seek(name)
- // Return nil if the key doesn't exist or it is not a bucket.
- if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
- return nil
- }
- // Otherwise create a bucket and cache it.
- var child = b.openBucket(v)
- if b.buckets != nil {
- b.buckets[string(name)] = child
- }
- return child
- }
- // Helper method that re-interprets a sub-bucket value
- // from a parent into a Bucket
- func (b *Bucket) openBucket(value []byte) *Bucket {
- var child = newBucket(b.tx)
- // If this is a writable transaction then we need to copy the bucket entry.
- // Read-only transactions can point directly at the mmap entry.
- if b.tx.writable {
- child.bucket = &bucket{}
- *child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
- } else {
- child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
- }
- // Save a reference to the inline page if the bucket is inline.
- if child.root == 0 {
- child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
- }
- return &child
- }
- // CreateBucket creates a new bucket at the given key and returns the new bucket.
- // Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
- // The bucket instance is only valid for the lifetime of the transaction.
- func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
- if b.tx.db == nil {
- return nil, ErrTxClosed
- } else if !b.tx.writable {
- return nil, ErrTxNotWritable
- } else if len(key) == 0 {
- return nil, ErrBucketNameRequired
- }
- // Move cursor to correct position.
- c := b.Cursor()
- k, _, flags := c.seek(key)
- // Return an error if there is an existing key.
- if bytes.Equal(key, k) {
- if (flags & bucketLeafFlag) != 0 {
- return nil, ErrBucketExists
- } else {
- return nil, ErrIncompatibleValue
- }
- }
- // Create empty, inline bucket.
- var bucket = Bucket{
- bucket: &bucket{},
- rootNode: &node{isLeaf: true},
- FillPercent: DefaultFillPercent,
- }
- var value = bucket.write()
- // Insert into node.
- key = cloneBytes(key)
- c.node().put(key, key, value, 0, bucketLeafFlag)
- // Since subbuckets are not allowed on inline buckets, we need to
- // dereference the inline page, if it exists. This will cause the bucket
- // to be treated as a regular, non-inline bucket for the rest of the tx.
- b.page = nil
- return b.Bucket(key), nil
- }
- // CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
- // Returns an error if the bucket name is blank, or if the bucket name is too long.
- // The bucket instance is only valid for the lifetime of the transaction.
- func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
- child, err := b.CreateBucket(key)
- if err == ErrBucketExists {
- return b.Bucket(key), nil
- } else if err != nil {
- return nil, err
- }
- return child, nil
- }
- // DeleteBucket deletes a bucket at the given key.
- // Returns an error if the bucket does not exists, or if the key represents a non-bucket value.
- func (b *Bucket) DeleteBucket(key []byte) error {
- if b.tx.db == nil {
- return ErrTxClosed
- } else if !b.Writable() {
- return ErrTxNotWritable
- }
- // Move cursor to correct position.
- c := b.Cursor()
- k, _, flags := c.seek(key)
- // Return an error if bucket doesn't exist or is not a bucket.
- if !bytes.Equal(key, k) {
- return ErrBucketNotFound
- } else if (flags & bucketLeafFlag) == 0 {
- return ErrIncompatibleValue
- }
- // Recursively delete all child buckets.
- child := b.Bucket(key)
- err := child.ForEach(func(k, v []byte) error {
- if v == nil {
- if err := child.DeleteBucket(k); err != nil {
- return fmt.Errorf("delete bucket: %s", err)
- }
- }
- return nil
- })
- if err != nil {
- return err
- }
- // Remove cached copy.
- delete(b.buckets, string(key))
- // Release all bucket pages to freelist.
- child.nodes = nil
- child.rootNode = nil
- child.free()
- // Delete the node if we have a matching key.
- c.node().del(key)
- return nil
- }
- // Get retrieves the value for a key in the bucket.
- // Returns a nil value if the key does not exist or if the key is a nested bucket.
- // The returned value is only valid for the life of the transaction.
- func (b *Bucket) Get(key []byte) []byte {
- k, v, flags := b.Cursor().seek(key)
- // Return nil if this is a bucket.
- if (flags & bucketLeafFlag) != 0 {
- return nil
- }
- // If our target node isn't the same key as what's passed in then return nil.
- if !bytes.Equal(key, k) {
- return nil
- }
- return v
- }
- // Put sets the value for a key in the bucket.
- // If the key exist then its previous value will be overwritten.
- // Supplied value must remain valid for the life of the transaction.
- // Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
- func (b *Bucket) Put(key []byte, value []byte) error {
- if b.tx.db == nil {
- return ErrTxClosed
- } else if !b.Writable() {
- return ErrTxNotWritable
- } else if len(key) == 0 {
- return ErrKeyRequired
- } else if len(key) > MaxKeySize {
- return ErrKeyTooLarge
- } else if int64(len(value)) > MaxValueSize {
- return ErrValueTooLarge
- }
- // Move cursor to correct position.
- c := b.Cursor()
- k, _, flags := c.seek(key)
- // Return an error if there is an existing key with a bucket value.
- if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
- return ErrIncompatibleValue
- }
- // Insert into node.
- key = cloneBytes(key)
- c.node().put(key, key, value, 0, 0)
- return nil
- }
- // Delete removes a key from the bucket.
- // If the key does not exist then nothing is done and a nil error is returned.
- // Returns an error if the bucket was created from a read-only transaction.
- func (b *Bucket) Delete(key []byte) error {
- if b.tx.db == nil {
- return ErrTxClosed
- } else if !b.Writable() {
- return ErrTxNotWritable
- }
- // Move cursor to correct position.
- c := b.Cursor()
- _, _, flags := c.seek(key)
- // Return an error if there is already existing bucket value.
- if (flags & bucketLeafFlag) != 0 {
- return ErrIncompatibleValue
- }
- // Delete the node if we have a matching key.
- c.node().del(key)
- return nil
- }
- // NextSequence returns an autoincrementing integer for the bucket.
- func (b *Bucket) NextSequence() (uint64, error) {
- if b.tx.db == nil {
- return 0, ErrTxClosed
- } else if !b.Writable() {
- return 0, ErrTxNotWritable
- }
- // Materialize the root node if it hasn't been already so that the
- // bucket will be saved during commit.
- if b.rootNode == nil {
- _ = b.node(b.root, nil)
- }
- // Increment and return the sequence.
- b.bucket.sequence++
- return b.bucket.sequence, nil
- }
- // ForEach executes a function for each key/value pair in a bucket.
- // If the provided function returns an error then the iteration is stopped and
- // the error is returned to the caller. The provided function must not modify
- // the bucket; this will result in undefined behavior.
- func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
- if b.tx.db == nil {
- return ErrTxClosed
- }
- c := b.Cursor()
- for k, v := c.First(); k != nil; k, v = c.Next() {
- if err := fn(k, v); err != nil {
- return err
- }
- }
- return nil
- }
- // Stat returns stats on a bucket.
- func (b *Bucket) Stats() BucketStats {
- var s, subStats BucketStats
- pageSize := b.tx.db.pageSize
- s.BucketN += 1
- if b.root == 0 {
- s.InlineBucketN += 1
- }
- b.forEachPage(func(p *page, depth int) {
- if (p.flags & leafPageFlag) != 0 {
- s.KeyN += int(p.count)
- // used totals the used bytes for the page
- used := pageHeaderSize
- if p.count != 0 {
- // If page has any elements, add all element headers.
- used += leafPageElementSize * int(p.count-1)
- // Add all element key, value sizes.
- // The computation takes advantage of the fact that the position
- // of the last element's key/value equals to the total of the sizes
- // of all previous elements' keys and values.
- // It also includes the last element's header.
- lastElement := p.leafPageElement(p.count - 1)
- used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
- }
- if b.root == 0 {
- // For inlined bucket just update the inline stats
- s.InlineBucketInuse += used
- } else {
- // For non-inlined bucket update all the leaf stats
- s.LeafPageN++
- s.LeafInuse += used
- s.LeafOverflowN += int(p.overflow)
- // Collect stats from sub-buckets.
- // Do that by iterating over all element headers
- // looking for the ones with the bucketLeafFlag.
- for i := uint16(0); i < p.count; i++ {
- e := p.leafPageElement(i)
- if (e.flags & bucketLeafFlag) != 0 {
- // For any bucket element, open the element value
- // and recursively call Stats on the contained bucket.
- subStats.Add(b.openBucket(e.value()).Stats())
- }
- }
- }
- } else if (p.flags & branchPageFlag) != 0 {
- s.BranchPageN++
- lastElement := p.branchPageElement(p.count - 1)
- // used totals the used bytes for the page
- // Add header and all element headers.
- used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
- // Add size of all keys and values.
- // Again, use the fact that last element's position equals to
- // the total of key, value sizes of all previous elements.
- used += int(lastElement.pos + lastElement.ksize)
- s.BranchInuse += used
- s.BranchOverflowN += int(p.overflow)
- }
- // Keep track of maximum page depth.
- if depth+1 > s.Depth {
- s.Depth = (depth + 1)
- }
- })
- // Alloc stats can be computed from page counts and pageSize.
- s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
- s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize
- // Add the max depth of sub-buckets to get total nested depth.
- s.Depth += subStats.Depth
- // Add the stats for all sub-buckets
- s.Add(subStats)
- return s
- }
- // forEachPage iterates over every page in a bucket, including inline pages.
- func (b *Bucket) forEachPage(fn func(*page, int)) {
- // If we have an inline page then just use that.
- if b.page != nil {
- fn(b.page, 0)
- return
- }
- // Otherwise traverse the page hierarchy.
- b.tx.forEachPage(b.root, 0, fn)
- }
- // forEachPageNode iterates over every page (or node) in a bucket.
- // This also includes inline pages.
- func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
- // If we have an inline page or root node then just use that.
- if b.page != nil {
- fn(b.page, nil, 0)
- return
- }
- b._forEachPageNode(b.root, 0, fn)
- }
- func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
- var p, n = b.pageNode(pgid)
- // Execute function.
- fn(p, n, depth)
- // Recursively loop over children.
- if p != nil {
- if (p.flags & branchPageFlag) != 0 {
- for i := 0; i < int(p.count); i++ {
- elem := p.branchPageElement(uint16(i))
- b._forEachPageNode(elem.pgid, depth+1, fn)
- }
- }
- } else {
- if !n.isLeaf {
- for _, inode := range n.inodes {
- b._forEachPageNode(inode.pgid, depth+1, fn)
- }
- }
- }
- }
- // spill writes all the nodes for this bucket to dirty pages.
- func (b *Bucket) spill() error {
- // Spill all child buckets first.
- for name, child := range b.buckets {
- // If the child bucket is small enough and it has no child buckets then
- // write it inline into the parent bucket's page. Otherwise spill it
- // like a normal bucket and make the parent value a pointer to the page.
- var value []byte
- if child.inlineable() {
- child.free()
- value = child.write()
- } else {
- if err := child.spill(); err != nil {
- return err
- }
- // Update the child bucket header in this bucket.
- value = make([]byte, unsafe.Sizeof(bucket{}))
- var bucket = (*bucket)(unsafe.Pointer(&value[0]))
- *bucket = *child.bucket
- }
- // Skip writing the bucket if there are no materialized nodes.
- if child.rootNode == nil {
- continue
- }
- // Update parent node.
- var c = b.Cursor()
- k, _, flags := c.seek([]byte(name))
- if !bytes.Equal([]byte(name), k) {
- panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
- }
- if flags&bucketLeafFlag == 0 {
- panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
- }
- c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
- }
- // Ignore if there's not a materialized root node.
- if b.rootNode == nil {
- return nil
- }
- // Spill nodes.
- if err := b.rootNode.spill(); err != nil {
- return err
- }
- b.rootNode = b.rootNode.root()
- // Update the root node for this bucket.
- if b.rootNode.pgid >= b.tx.meta.pgid {
- panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
- }
- b.root = b.rootNode.pgid
- return nil
- }
- // inlineable returns true if a bucket is small enough to be written inline
- // and if it contains no subbuckets. Otherwise returns false.
- func (b *Bucket) inlineable() bool {
- var n = b.rootNode
- // Bucket must only contain a single leaf node.
- if n == nil || !n.isLeaf {
- return false
- }
- // Bucket is not inlineable if it contains subbuckets or if it goes beyond
- // our threshold for inline bucket size.
- var size = pageHeaderSize
- for _, inode := range n.inodes {
- size += leafPageElementSize + len(inode.key) + len(inode.value)
- if inode.flags&bucketLeafFlag != 0 {
- return false
- } else if size > b.maxInlineBucketSize() {
- return false
- }
- }
- return true
- }
- // Returns the maximum total size of a bucket to make it a candidate for inlining.
- func (b *Bucket) maxInlineBucketSize() int {
- return b.tx.db.pageSize / 4
- }
- // write allocates and writes a bucket to a byte slice.
- func (b *Bucket) write() []byte {
- // Allocate the appropriate size.
- var n = b.rootNode
- var value = make([]byte, bucketHeaderSize+n.size())
- // Write a bucket header.
- var bucket = (*bucket)(unsafe.Pointer(&value[0]))
- *bucket = *b.bucket
- // Convert byte slice to a fake page and write the root node.
- var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
- n.write(p)
- return value
- }
- // rebalance attempts to balance all nodes.
- func (b *Bucket) rebalance() {
- for _, n := range b.nodes {
- n.rebalance()
- }
- for _, child := range b.buckets {
- child.rebalance()
- }
- }
- // node creates a node from a page and associates it with a given parent.
- func (b *Bucket) node(pgid pgid, parent *node) *node {
- _assert(b.nodes != nil, "nodes map expected")
- // Retrieve node if it's already been created.
- if n := b.nodes[pgid]; n != nil {
- return n
- }
- // Otherwise create a node and cache it.
- n := &node{bucket: b, parent: parent}
- if parent == nil {
- b.rootNode = n
- } else {
- parent.children = append(parent.children, n)
- }
- // Use the inline page if this is an inline bucket.
- var p = b.page
- if p == nil {
- p = b.tx.page(pgid)
- }
- // Read the page into the node and cache it.
- n.read(p)
- b.nodes[pgid] = n
- // Update statistics.
- b.tx.stats.NodeCount++
- return n
- }
- // free recursively frees all pages in the bucket.
- func (b *Bucket) free() {
- if b.root == 0 {
- return
- }
- var tx = b.tx
- b.forEachPageNode(func(p *page, n *node, _ int) {
- if p != nil {
- tx.db.freelist.free(tx.meta.txid, p)
- } else {
- n.free()
- }
- })
- b.root = 0
- }
- // dereference removes all references to the old mmap.
- func (b *Bucket) dereference() {
- if b.rootNode != nil {
- b.rootNode.root().dereference()
- }
- for _, child := range b.buckets {
- child.dereference()
- }
- }
- // pageNode returns the in-memory node, if it exists.
- // Otherwise returns the underlying page.
- func (b *Bucket) pageNode(id pgid) (*page, *node) {
- // Inline buckets have a fake page embedded in their value so treat them
- // differently. We'll return the rootNode (if available) or the fake page.
- if b.root == 0 {
- if id != 0 {
- panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
- }
- if b.rootNode != nil {
- return nil, b.rootNode
- }
- return b.page, nil
- }
- // Check the node cache for non-inline buckets.
- if b.nodes != nil {
- if n := b.nodes[id]; n != nil {
- return nil, n
- }
- }
- // Finally lookup the page from the transaction if no node is materialized.
- return b.tx.page(id), nil
- }
- // BucketStats records statistics about resources used by a bucket.
- type BucketStats struct {
- // Page count statistics.
- BranchPageN int // number of logical branch pages
- BranchOverflowN int // number of physical branch overflow pages
- LeafPageN int // number of logical leaf pages
- LeafOverflowN int // number of physical leaf overflow pages
- // Tree statistics.
- KeyN int // number of keys/value pairs
- Depth int // number of levels in B+tree
- // Page size utilization.
- BranchAlloc int // bytes allocated for physical branch pages
- BranchInuse int // bytes actually used for branch data
- LeafAlloc int // bytes allocated for physical leaf pages
- LeafInuse int // bytes actually used for leaf data
- // Bucket statistics
- BucketN int // total number of buckets including the top bucket
- InlineBucketN int // total number on inlined buckets
- InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
- }
- func (s *BucketStats) Add(other BucketStats) {
- s.BranchPageN += other.BranchPageN
- s.BranchOverflowN += other.BranchOverflowN
- s.LeafPageN += other.LeafPageN
- s.LeafOverflowN += other.LeafOverflowN
- s.KeyN += other.KeyN
- if s.Depth < other.Depth {
- s.Depth = other.Depth
- }
- s.BranchAlloc += other.BranchAlloc
- s.BranchInuse += other.BranchInuse
- s.LeafAlloc += other.LeafAlloc
- s.LeafInuse += other.LeafInuse
- s.BucketN += other.BucketN
- s.InlineBucketN += other.InlineBucketN
- s.InlineBucketInuse += other.InlineBucketInuse
- }
- // cloneBytes returns a copy of a given slice.
- func cloneBytes(v []byte) []byte {
- var clone = make([]byte, len(v))
- copy(clone, v)
- return clone
- }
|