You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							452 lines
						
					
					
						
							10 KiB
						
					
					
				
			
		
		
	
	
							452 lines
						
					
					
						
							10 KiB
						
					
					
				| //  Copyright (c) 2017 Couchbase, Inc.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| // 		http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package vellum
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"io"
 | |
| )
 | |
| 
 | |
| var defaultBuilderOpts = &BuilderOpts{
 | |
| 	Encoder:           1,
 | |
| 	RegistryTableSize: 10000,
 | |
| 	RegistryMRUSize:   2,
 | |
| }
 | |
| 
 | |
| // A Builder is used to build a new FST.  When possible data is
 | |
| // streamed out to the underlying Writer as soon as possible.
 | |
| type Builder struct {
 | |
| 	unfinished *unfinishedNodes
 | |
| 	registry   *registry
 | |
| 	last       []byte
 | |
| 	len        int
 | |
| 
 | |
| 	lastAddr int
 | |
| 
 | |
| 	encoder encoder
 | |
| 	opts    *BuilderOpts
 | |
| 
 | |
| 	builderNodePool *builderNodePool
 | |
| }
 | |
| 
 | |
const (
	// noneAddr marks "no address yet": it is the initial value of
	// Builder.lastAddr and of the running address in compileFrom.
	noneAddr = 1
	// emptyAddr is presumably the address of the empty final state (compile
	// returns 0 for such nodes) — NOTE(review): confirm against the encoder.
	emptyAddr = 0
)
 | |
| 
 | |
| // NewBuilder returns a new Builder which will stream out the
 | |
| // underlying representation to the provided Writer as the set is built.
 | |
| func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
 | |
| 	if opts == nil {
 | |
| 		opts = defaultBuilderOpts
 | |
| 	}
 | |
| 	builderNodePool := &builderNodePool{}
 | |
| 	rv := &Builder{
 | |
| 		unfinished:      newUnfinishedNodes(builderNodePool),
 | |
| 		registry:        newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize),
 | |
| 		builderNodePool: builderNodePool,
 | |
| 		opts:            opts,
 | |
| 		lastAddr:        noneAddr,
 | |
| 	}
 | |
| 
 | |
| 	var err error
 | |
| 	rv.encoder, err = loadEncoder(opts.Encoder, w)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	err = rv.encoder.start()
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return rv, nil
 | |
| }
 | |
| 
 | |
| func (b *Builder) Reset(w io.Writer) error {
 | |
| 	b.unfinished.Reset()
 | |
| 	b.registry.Reset()
 | |
| 	b.lastAddr = noneAddr
 | |
| 	b.encoder.reset(w)
 | |
| 	b.last = nil
 | |
| 	b.len = 0
 | |
| 
 | |
| 	err := b.encoder.start()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Insert the provided value to the set being built.
 | |
| // NOTE: values must be inserted in lexicographical order.
 | |
| func (b *Builder) Insert(key []byte, val uint64) error {
 | |
| 	// ensure items are added in lexicographic order
 | |
| 	if bytes.Compare(key, b.last) < 0 {
 | |
| 		return ErrOutOfOrder
 | |
| 	}
 | |
| 	if len(key) == 0 {
 | |
| 		b.len = 1
 | |
| 		b.unfinished.setRootOutput(val)
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val)
 | |
| 	b.len++
 | |
| 	err := b.compileFrom(prefixLen)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	b.copyLastKey(key)
 | |
| 	b.unfinished.addSuffix(key[prefixLen:], out)
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (b *Builder) copyLastKey(key []byte) {
 | |
| 	if b.last == nil {
 | |
| 		b.last = make([]byte, 0, 64)
 | |
| 	} else {
 | |
| 		b.last = b.last[:0]
 | |
| 	}
 | |
| 	b.last = append(b.last, key...)
 | |
| }
 | |
| 
 | |
| // Close MUST be called after inserting all values.
 | |
| func (b *Builder) Close() error {
 | |
| 	err := b.compileFrom(0)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	root := b.unfinished.popRoot()
 | |
| 	rootAddr, err := b.compile(root)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return b.encoder.finish(b.len, rootAddr)
 | |
| }
 | |
| 
 | |
| func (b *Builder) compileFrom(iState int) error {
 | |
| 	addr := noneAddr
 | |
| 	for iState+1 < len(b.unfinished.stack) {
 | |
| 		var node *builderNode
 | |
| 		if addr == noneAddr {
 | |
| 			node = b.unfinished.popEmpty()
 | |
| 		} else {
 | |
| 			node = b.unfinished.popFreeze(addr)
 | |
| 		}
 | |
| 		var err error
 | |
| 		addr, err = b.compile(node)
 | |
| 		if err != nil {
 | |
| 			return nil
 | |
| 		}
 | |
| 	}
 | |
| 	b.unfinished.topLastFreeze(addr)
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (b *Builder) compile(node *builderNode) (int, error) {
 | |
| 	if node.final && len(node.trans) == 0 &&
 | |
| 		node.finalOutput == 0 {
 | |
| 		return 0, nil
 | |
| 	}
 | |
| 	found, addr, entry := b.registry.entry(node)
 | |
| 	if found {
 | |
| 		return addr, nil
 | |
| 	}
 | |
| 	addr, err := b.encoder.encodeState(node, b.lastAddr)
 | |
| 	if err != nil {
 | |
| 		return 0, err
 | |
| 	}
 | |
| 
 | |
| 	b.lastAddr = addr
 | |
| 	entry.addr = addr
 | |
| 	return addr, nil
 | |
| }
 | |
| 
 | |
| type unfinishedNodes struct {
 | |
| 	stack []*builderNodeUnfinished
 | |
| 
 | |
| 	// cache allocates a reasonable number of builderNodeUnfinished
 | |
| 	// objects up front and tries to keep reusing them
 | |
| 	// because the main data structure is a stack, we assume the
 | |
| 	// same access pattern, and don't track items separately
 | |
| 	// this means calls get() and pushXYZ() must be paired,
 | |
| 	// as well as calls put() and popXYZ()
 | |
| 	cache []builderNodeUnfinished
 | |
| 
 | |
| 	builderNodePool *builderNodePool
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) Reset() {
 | |
| 	u.stack = u.stack[:0]
 | |
| 	for i := 0; i < len(u.cache); i++ {
 | |
| 		u.cache[i] = builderNodeUnfinished{}
 | |
| 	}
 | |
| 	u.pushEmpty(false)
 | |
| }
 | |
| 
 | |
| func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
 | |
| 	rv := &unfinishedNodes{
 | |
| 		stack:           make([]*builderNodeUnfinished, 0, 64),
 | |
| 		cache:           make([]builderNodeUnfinished, 64),
 | |
| 		builderNodePool: p,
 | |
| 	}
 | |
| 	rv.pushEmpty(false)
 | |
| 	return rv
 | |
| }
 | |
| 
 | |
| // get new builderNodeUnfinished, reusing cache if possible
 | |
| func (u *unfinishedNodes) get() *builderNodeUnfinished {
 | |
| 	if len(u.stack) < len(u.cache) {
 | |
| 		return &u.cache[len(u.stack)]
 | |
| 	}
 | |
| 	// full now allocate a new one
 | |
| 	return &builderNodeUnfinished{}
 | |
| }
 | |
| 
 | |
| // return builderNodeUnfinished, clearing it for reuse
 | |
| func (u *unfinishedNodes) put() {
 | |
| 	if len(u.stack) >= len(u.cache) {
 | |
| 		return
 | |
| 		// do nothing, not part of cache
 | |
| 	}
 | |
| 	u.cache[len(u.stack)] = builderNodeUnfinished{}
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte,
 | |
| 	out uint64) (int, uint64) {
 | |
| 	var i int
 | |
| 	for i < len(key) {
 | |
| 		if i >= len(u.stack) {
 | |
| 			break
 | |
| 		}
 | |
| 		var addPrefix uint64
 | |
| 		if !u.stack[i].hasLastT {
 | |
| 			break
 | |
| 		}
 | |
| 		if u.stack[i].lastIn == key[i] {
 | |
| 			commonPre := outputPrefix(u.stack[i].lastOut, out)
 | |
| 			addPrefix = outputSub(u.stack[i].lastOut, commonPre)
 | |
| 			out = outputSub(out, commonPre)
 | |
| 			u.stack[i].lastOut = commonPre
 | |
| 			i++
 | |
| 		} else {
 | |
| 			break
 | |
| 		}
 | |
| 
 | |
| 		if addPrefix != 0 {
 | |
| 			u.stack[i].addOutputPrefix(addPrefix)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return i, out
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) pushEmpty(final bool) {
 | |
| 	next := u.get()
 | |
| 	next.node = u.builderNodePool.Get()
 | |
| 	next.node.final = final
 | |
| 	u.stack = append(u.stack, next)
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) popRoot() *builderNode {
 | |
| 	l := len(u.stack)
 | |
| 	var unfinished *builderNodeUnfinished
 | |
| 	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
 | |
| 	rv := unfinished.node
 | |
| 	u.put()
 | |
| 	return rv
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) popFreeze(addr int) *builderNode {
 | |
| 	l := len(u.stack)
 | |
| 	var unfinished *builderNodeUnfinished
 | |
| 	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
 | |
| 	unfinished.lastCompiled(addr)
 | |
| 	rv := unfinished.node
 | |
| 	u.put()
 | |
| 	return rv
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) popEmpty() *builderNode {
 | |
| 	l := len(u.stack)
 | |
| 	var unfinished *builderNodeUnfinished
 | |
| 	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
 | |
| 	rv := unfinished.node
 | |
| 	u.put()
 | |
| 	return rv
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) setRootOutput(out uint64) {
 | |
| 	u.stack[0].node.final = true
 | |
| 	u.stack[0].node.finalOutput = out
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) topLastFreeze(addr int) {
 | |
| 	last := len(u.stack) - 1
 | |
| 	u.stack[last].lastCompiled(addr)
 | |
| }
 | |
| 
 | |
| func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) {
 | |
| 	if len(bs) == 0 {
 | |
| 		return
 | |
| 	}
 | |
| 	last := len(u.stack) - 1
 | |
| 	u.stack[last].hasLastT = true
 | |
| 	u.stack[last].lastIn = bs[0]
 | |
| 	u.stack[last].lastOut = out
 | |
| 	for _, b := range bs[1:] {
 | |
| 		next := u.get()
 | |
| 		next.node = u.builderNodePool.Get()
 | |
| 		next.hasLastT = true
 | |
| 		next.lastIn = b
 | |
| 		next.lastOut = 0
 | |
| 		u.stack = append(u.stack, next)
 | |
| 	}
 | |
| 	u.pushEmpty(true)
 | |
| }
 | |
| 
 | |
| type builderNodeUnfinished struct {
 | |
| 	node     *builderNode
 | |
| 	lastOut  uint64
 | |
| 	lastIn   byte
 | |
| 	hasLastT bool
 | |
| }
 | |
| 
 | |
| func (b *builderNodeUnfinished) lastCompiled(addr int) {
 | |
| 	if b.hasLastT {
 | |
| 		transIn := b.lastIn
 | |
| 		transOut := b.lastOut
 | |
| 		b.hasLastT = false
 | |
| 		b.lastOut = 0
 | |
| 		b.node.trans = append(b.node.trans, transition{
 | |
| 			in:   transIn,
 | |
| 			out:  transOut,
 | |
| 			addr: addr,
 | |
| 		})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) {
 | |
| 	if b.node.final {
 | |
| 		b.node.finalOutput = outputCat(prefix, b.node.finalOutput)
 | |
| 	}
 | |
| 	for i := range b.node.trans {
 | |
| 		b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out)
 | |
| 	}
 | |
| 	if b.hasLastT {
 | |
| 		b.lastOut = outputCat(prefix, b.lastOut)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| type builderNode struct {
 | |
| 	finalOutput uint64
 | |
| 	trans       []transition
 | |
| 	final       bool
 | |
| 
 | |
| 	// intrusive linked list
 | |
| 	next *builderNode
 | |
| }
 | |
| 
 | |
| // reset resets the receiver builderNode to a re-usable state.
 | |
| func (n *builderNode) reset() {
 | |
| 	n.final = false
 | |
| 	n.finalOutput = 0
 | |
| 	for i := range n.trans {
 | |
| 		n.trans[i] = emptyTransition
 | |
| 	}
 | |
| 	n.trans = n.trans[:0]
 | |
| 	n.next = nil
 | |
| }
 | |
| 
 | |
| func (n *builderNode) equiv(o *builderNode) bool {
 | |
| 	if n.final != o.final {
 | |
| 		return false
 | |
| 	}
 | |
| 	if n.finalOutput != o.finalOutput {
 | |
| 		return false
 | |
| 	}
 | |
| 	if len(n.trans) != len(o.trans) {
 | |
| 		return false
 | |
| 	}
 | |
| 	for i, ntrans := range n.trans {
 | |
| 		otrans := o.trans[i]
 | |
| 		if ntrans.in != otrans.in {
 | |
| 			return false
 | |
| 		}
 | |
| 		if ntrans.addr != otrans.addr {
 | |
| 			return false
 | |
| 		}
 | |
| 		if ntrans.out != otrans.out {
 | |
| 			return false
 | |
| 		}
 | |
| 	}
 | |
| 	return true
 | |
| }
 | |
| 
 | |
| var emptyTransition = transition{}
 | |
| 
 | |
// transition is one outgoing edge of a builderNode.
type transition struct {
	out  uint64 // output attached to the edge
	addr int    // address of the compiled target node
	in   byte   // input byte that selects the edge
}
 | |
| 
 | |
// outputPrefix returns the part two outputs have in common, which for
// numeric outputs is the smaller of l and r.
func outputPrefix(l, r uint64) uint64 {
	if r <= l {
		return r
	}
	return l
}
 | |
| 
 | |
// outputSub removes r from l. The subtraction is plain uint64 arithmetic, so
// it wraps around when r is larger than l.
func outputSub(l, r uint64) uint64 {
	diff := l - r
	return diff
}
 | |
| 
 | |
// outputCat combines two outputs, which for numeric outputs is their sum
// (plain uint64 addition, wrapping on overflow).
func outputCat(l, r uint64) uint64 {
	sum := l + r
	return sum
}
 | |
| 
 | |
| // builderNodePool pools builderNodes using a singly linked list.
 | |
| //
 | |
| // NB: builderNode lifecycle is described by the following interactions -
 | |
| // +------------------------+                            +----------------------+
 | |
| // |    Unfinished Nodes    |      Transfer once         |        Registry      |
 | |
| // |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) |
 | |
| // +------------------------+      marked frozen         +----------------------+
 | |
| //              ^                                                     |
 | |
| //              |                                                     |
 | |
| //              |                                                   Put()
 | |
| //              | Get() on        +-------------------+             when
 | |
| //              +-new char--------| builderNode Pool  |<-----------evicted
 | |
| //                                +-------------------+
 | |
| type builderNodePool struct {
 | |
| 	head *builderNode
 | |
| }
 | |
| 
 | |
| func (p *builderNodePool) Get() *builderNode {
 | |
| 	if p.head == nil {
 | |
| 		return &builderNode{}
 | |
| 	}
 | |
| 	head := p.head
 | |
| 	p.head = p.head.next
 | |
| 	return head
 | |
| }
 | |
| 
 | |
| func (p *builderNodePool) Put(v *builderNode) {
 | |
| 	if v == nil {
 | |
| 		return
 | |
| 	}
 | |
| 	v.reset()
 | |
| 	v.next = p.head
 | |
| 	p.head = v
 | |
| }
 | |
| 
 |