You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							349 lines
						
					
					
						
							8.4 KiB
						
					
					
				
			
		
		
	
	
							349 lines
						
					
					
						
							8.4 KiB
						
					
					
				| //  Copyright (c) 2018 Couchbase, Inc.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| // 		http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package levenshtein2
 | |
| 
 | |
| import (
 | |
| 	"crypto/md5"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"math"
 | |
| )
 | |
| 
 | |
| type ParametricState struct {
 | |
| 	shapeID uint32
 | |
| 	offset  uint32
 | |
| }
 | |
| 
 | |
| func newParametricState() ParametricState {
 | |
| 	return ParametricState{}
 | |
| }
 | |
| 
 | |
| func (ps *ParametricState) isDeadEnd() bool {
 | |
| 	return ps.shapeID == 0
 | |
| }
 | |
| 
 | |
| type Transition struct {
 | |
| 	destShapeID uint32
 | |
| 	deltaOffset uint32
 | |
| }
 | |
| 
 | |
| func (t *Transition) apply(state ParametricState) ParametricState {
 | |
| 	ps := ParametricState{
 | |
| 		shapeID: t.destShapeID}
 | |
| 	// don't need any offset if we are in the dead state,
 | |
| 	// this ensures we have only one dead state.
 | |
| 	if t.destShapeID != 0 {
 | |
| 		ps.offset = state.offset + t.deltaOffset
 | |
| 	}
 | |
| 
 | |
| 	return ps
 | |
| }
 | |
| 
 | |
| type ParametricStateIndex struct {
 | |
| 	stateIndex []uint32
 | |
| 	stateQueue []ParametricState
 | |
| 	numOffsets uint32
 | |
| }
 | |
| 
 | |
| func newParametricStateIndex(queryLen,
 | |
| 	numParamState uint32) ParametricStateIndex {
 | |
| 	numOffsets := queryLen + 1
 | |
| 	if numParamState == 0 {
 | |
| 		numParamState = numOffsets
 | |
| 	}
 | |
| 	maxNumStates := numParamState * numOffsets
 | |
| 	psi := ParametricStateIndex{
 | |
| 		stateIndex: make([]uint32, maxNumStates),
 | |
| 		stateQueue: make([]ParametricState, 0, 150),
 | |
| 		numOffsets: numOffsets,
 | |
| 	}
 | |
| 
 | |
| 	for i := uint32(0); i < maxNumStates; i++ {
 | |
| 		psi.stateIndex[i] = math.MaxUint32
 | |
| 	}
 | |
| 	return psi
 | |
| }
 | |
| 
 | |
| func (psi *ParametricStateIndex) numStates() int {
 | |
| 	return len(psi.stateQueue)
 | |
| }
 | |
| 
 | |
| func (psi *ParametricStateIndex) maxNumStates() int {
 | |
| 	return len(psi.stateIndex)
 | |
| }
 | |
| 
 | |
| func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
 | |
| 	return psi.stateQueue[stateID]
 | |
| }
 | |
| 
 | |
| func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
 | |
| 	bucket := ps.shapeID*psi.numOffsets + ps.offset
 | |
| 	if bucket < uint32(len(psi.stateIndex)) &&
 | |
| 		psi.stateIndex[bucket] != math.MaxUint32 {
 | |
| 		return psi.stateIndex[bucket]
 | |
| 	}
 | |
| 	nState := uint32(len(psi.stateQueue))
 | |
| 	psi.stateQueue = append(psi.stateQueue, ps)
 | |
| 
 | |
| 	psi.stateIndex[bucket] = nState
 | |
| 	return nState
 | |
| }
 | |
| 
 | |
| type ParametricDFA struct {
 | |
| 	distance         []uint8
 | |
| 	transitions      []Transition
 | |
| 	maxDistance      uint8
 | |
| 	transitionStride uint32
 | |
| 	diameter         uint32
 | |
| }
 | |
| 
 | |
| func (pdfa *ParametricDFA) initialState() ParametricState {
 | |
| 	return ParametricState{shapeID: 1}
 | |
| }
 | |
| 
 | |
| // Returns true iff whatever characters come afterward,
 | |
| // we will never reach a shorter distance
 | |
| func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
 | |
| 	if state.isDeadEnd() {
 | |
| 		return true
 | |
| 	}
 | |
| 
 | |
| 	remOffset := queryLen - state.offset
 | |
| 	if remOffset < pdfa.diameter {
 | |
| 		stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
 | |
| 		prefixDistance := stateDistances[remOffset]
 | |
| 		if prefixDistance > pdfa.maxDistance {
 | |
| 			return false
 | |
| 		}
 | |
| 
 | |
| 		for _, d := range stateDistances {
 | |
| 			if d < prefixDistance {
 | |
| 				return false
 | |
| 			}
 | |
| 		}
 | |
| 		return true
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| func (pdfa *ParametricDFA) numStates() int {
 | |
| 	return len(pdfa.transitions) / int(pdfa.transitionStride)
 | |
| }
 | |
| 
 | |
| func min(x, y uint32) uint32 {
 | |
| 	if x < y {
 | |
| 		return x
 | |
| 	}
 | |
| 	return y
 | |
| }
 | |
| 
 | |
| func (pdfa *ParametricDFA) transition(state ParametricState,
 | |
| 	chi uint32) Transition {
 | |
| 	return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
 | |
| }
 | |
| 
 | |
| func (pdfa *ParametricDFA) getDistance(state ParametricState,
 | |
| 	qLen uint32) Distance {
 | |
| 	remainingOffset := qLen - state.offset
 | |
| 	if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
 | |
| 		return Atleast{d: pdfa.maxDistance + 1}
 | |
| 	}
 | |
| 	dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
 | |
| 	if dist > pdfa.maxDistance {
 | |
| 		return Atleast{d: dist}
 | |
| 	}
 | |
| 	return Exact{d: dist}
 | |
| }
 | |
| 
 | |
| func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
 | |
| 	state := pdfa.initialState()
 | |
| 	leftChars := []rune(left)
 | |
| 	for _, chr := range []rune(right) {
 | |
| 		start := state.offset
 | |
| 		stop := min(start+pdfa.diameter, uint32(len(leftChars)))
 | |
| 		chi := characteristicVector(leftChars[start:stop], chr)
 | |
| 		transition := pdfa.transition(state, uint32(chi))
 | |
| 		state = transition.apply(state)
 | |
| 		if state.isDeadEnd() {
 | |
| 			return Atleast{d: pdfa.maxDistance + 1}
 | |
| 		}
 | |
| 	}
 | |
| 	return pdfa.getDistance(state, uint32(len(left)))
 | |
| }
 | |
| 
 | |
| func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
 | |
| 	prefix bool) (*DFA, error) {
 | |
| 	qLen := uint32(len([]rune(query)))
 | |
| 	alphabet := queryChars(query)
 | |
| 
 | |
| 	psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
 | |
| 	maxNumStates := psi.maxNumStates()
 | |
| 	deadEndStateID := psi.getOrAllocate(newParametricState())
 | |
| 	if deadEndStateID != 0 {
 | |
| 		return nil, fmt.Errorf("Invalid dead end state")
 | |
| 	}
 | |
| 
 | |
| 	initialStateID := psi.getOrAllocate(pdfa.initialState())
 | |
| 	dfaBuilder := withMaxStates(uint32(maxNumStates))
 | |
| 	mask := uint32((1 << pdfa.diameter) - 1)
 | |
| 
 | |
| 	var stateID int
 | |
| 	for stateID = 0; stateID < StateLimit; stateID++ {
 | |
| 		if stateID == psi.numStates() {
 | |
| 			break
 | |
| 		}
 | |
| 		state := psi.get(uint32(stateID))
 | |
| 		if prefix && pdfa.isPrefixSink(state, qLen) {
 | |
| 			distance := pdfa.getDistance(state, qLen)
 | |
| 			dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
 | |
| 		} else {
 | |
| 			transition := pdfa.transition(state, 0)
 | |
| 			defSuccessor := transition.apply(state)
 | |
| 			defSuccessorID := psi.getOrAllocate(defSuccessor)
 | |
| 			distance := pdfa.getDistance(state, qLen)
 | |
| 			stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
 | |
| 
 | |
| 			if err != nil {
 | |
| 				return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
 | |
| 			}
 | |
| 
 | |
| 			alphabet.resetNext()
 | |
| 			chr, cv, err := alphabet.next()
 | |
| 			for err == nil {
 | |
| 				chi := cv.shiftAndMask(state.offset, mask)
 | |
| 
 | |
| 				transition := pdfa.transition(state, chi)
 | |
| 
 | |
| 				destState := transition.apply(state)
 | |
| 
 | |
| 				destStateID := psi.getOrAllocate(destState)
 | |
| 
 | |
| 				stateBuilder.addTransition(chr, destStateID)
 | |
| 
 | |
| 				chr, cv, err = alphabet.next()
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if stateID == StateLimit {
 | |
| 		return nil, ErrTooManyStates
 | |
| 	}
 | |
| 
 | |
| 	dfaBuilder.setInitialState(initialStateID)
 | |
| 	return dfaBuilder.build(distance), nil
 | |
| }
 | |
| 
 | |
| func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
 | |
| 	lookUp := newHash()
 | |
| 	lookUp.getOrAllocate(*newMultiState())
 | |
| 	initialState := nfa.initialStates()
 | |
| 	lookUp.getOrAllocate(*initialState)
 | |
| 
 | |
| 	maxDistance := nfa.maxDistance()
 | |
| 	msDiameter := nfa.msDiameter()
 | |
| 
 | |
| 	numChi := 1 << msDiameter
 | |
| 	chiValues := make([]uint64, numChi)
 | |
| 	for i := 0; i < numChi; i++ {
 | |
| 		chiValues[i] = uint64(i)
 | |
| 	}
 | |
| 
 | |
| 	transitions := make([]Transition, 0, numChi*int(msDiameter))
 | |
| 	var stateID int
 | |
| 	for stateID = 0; stateID < StateLimit; stateID++ {
 | |
| 		if stateID == len(lookUp.items) {
 | |
| 			break
 | |
| 		}
 | |
| 
 | |
| 		for _, chi := range chiValues {
 | |
| 			destMs := newMultiState()
 | |
| 
 | |
| 			ms := lookUp.getFromID(stateID)
 | |
| 
 | |
| 			nfa.transition(ms, destMs, chi)
 | |
| 
 | |
| 			translation := destMs.normalize()
 | |
| 
 | |
| 			destID := lookUp.getOrAllocate(*destMs)
 | |
| 
 | |
| 			transitions = append(transitions, Transition{
 | |
| 				destShapeID: uint32(destID),
 | |
| 				deltaOffset: translation,
 | |
| 			})
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if stateID == StateLimit {
 | |
| 		return nil, ErrTooManyStates
 | |
| 	}
 | |
| 
 | |
| 	ns := len(lookUp.items)
 | |
| 	diameter := int(msDiameter)
 | |
| 
 | |
| 	distances := make([]uint8, 0, diameter*ns)
 | |
| 	for stateID := 0; stateID < ns; stateID++ {
 | |
| 		ms := lookUp.getFromID(stateID)
 | |
| 		for offset := 0; offset < diameter; offset++ {
 | |
| 			dist := nfa.multistateDistance(ms, uint32(offset))
 | |
| 			distances = append(distances, dist.distance())
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return &ParametricDFA{
 | |
| 		diameter:         uint32(msDiameter),
 | |
| 		transitions:      transitions,
 | |
| 		maxDistance:      maxDistance,
 | |
| 		transitionStride: uint32(numChi),
 | |
| 		distance:         distances,
 | |
| 	}, nil
 | |
| }
 | |
| 
 | |
| type hash struct {
 | |
| 	index map[[16]byte]int
 | |
| 	items []MultiState
 | |
| }
 | |
| 
 | |
| func newHash() *hash {
 | |
| 	return &hash{
 | |
| 		index: make(map[[16]byte]int, 100),
 | |
| 		items: make([]MultiState, 0, 100),
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (h *hash) getOrAllocate(m MultiState) int {
 | |
| 	size := len(h.items)
 | |
| 	var exists bool
 | |
| 	var pos int
 | |
| 	md5 := getHash(&m)
 | |
| 	if pos, exists = h.index[md5]; !exists {
 | |
| 		h.index[md5] = size
 | |
| 		pos = size
 | |
| 		h.items = append(h.items, m)
 | |
| 	}
 | |
| 	return pos
 | |
| }
 | |
| 
 | |
| func (h *hash) getFromID(id int) *MultiState {
 | |
| 	return &h.items[id]
 | |
| }
 | |
| 
 | |
| func getHash(ms *MultiState) [16]byte {
 | |
| 	msBytes := []byte{}
 | |
| 	for _, state := range ms.states {
 | |
| 		jsonBytes, _ := json.Marshal(&state)
 | |
| 		msBytes = append(msBytes, jsonBytes...)
 | |
| 	}
 | |
| 	return md5.Sum(msBytes)
 | |
| }
 | |
| 
 |