You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
350 lines
8.4 KiB
350 lines
8.4 KiB
6 years ago
|
// Copyright (c) 2018 Couchbase, Inc.
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
5 years ago
|
package levenshtein
|
||
6 years ago
|
|
||
|
import (
|
||
|
"crypto/md5"
|
||
|
"encoding/json"
|
||
|
"fmt"
|
||
|
"math"
|
||
|
)
|
||
|
|
||
|
// ParametricState is a (shape, offset) pair identifying one state of the
// parametric automaton. A shapeID of 0 denotes the dead-end state; offset
// is advanced by the deltaOffset of every transition that is applied.
type ParametricState struct {
	shapeID, offset uint32
}
|
||
|
|
||
|
func newParametricState() ParametricState {
|
||
|
return ParametricState{}
|
||
|
}
|
||
|
|
||
|
func (ps *ParametricState) isDeadEnd() bool {
|
||
|
return ps.shapeID == 0
|
||
|
}
|
||
|
|
||
|
// Transition describes one edge of the parametric automaton: the shape it
// leads to (destShapeID, 0 meaning the dead-end state) and the amount by
// which the source state's offset is advanced (deltaOffset).
type Transition struct {
	destShapeID, deltaOffset uint32
}
|
||
|
|
||
|
func (t *Transition) apply(state ParametricState) ParametricState {
|
||
|
ps := ParametricState{
|
||
|
shapeID: t.destShapeID}
|
||
|
// don't need any offset if we are in the dead state,
|
||
|
// this ensures we have only one dead state.
|
||
|
if t.destShapeID != 0 {
|
||
|
ps.offset = state.offset + t.deltaOffset
|
||
|
}
|
||
|
|
||
|
return ps
|
||
|
}
|
||
|
|
||
|
type ParametricStateIndex struct {
|
||
|
stateIndex []uint32
|
||
|
stateQueue []ParametricState
|
||
|
numOffsets uint32
|
||
|
}
|
||
|
|
||
|
func newParametricStateIndex(queryLen,
|
||
|
numParamState uint32) ParametricStateIndex {
|
||
|
numOffsets := queryLen + 1
|
||
|
if numParamState == 0 {
|
||
|
numParamState = numOffsets
|
||
|
}
|
||
|
maxNumStates := numParamState * numOffsets
|
||
|
psi := ParametricStateIndex{
|
||
|
stateIndex: make([]uint32, maxNumStates),
|
||
|
stateQueue: make([]ParametricState, 0, 150),
|
||
|
numOffsets: numOffsets,
|
||
|
}
|
||
|
|
||
|
for i := uint32(0); i < maxNumStates; i++ {
|
||
|
psi.stateIndex[i] = math.MaxUint32
|
||
|
}
|
||
|
return psi
|
||
|
}
|
||
|
|
||
|
func (psi *ParametricStateIndex) numStates() int {
|
||
|
return len(psi.stateQueue)
|
||
|
}
|
||
|
|
||
|
func (psi *ParametricStateIndex) maxNumStates() int {
|
||
|
return len(psi.stateIndex)
|
||
|
}
|
||
|
|
||
|
func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
|
||
|
return psi.stateQueue[stateID]
|
||
|
}
|
||
|
|
||
|
func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
|
||
|
bucket := ps.shapeID*psi.numOffsets + ps.offset
|
||
|
if bucket < uint32(len(psi.stateIndex)) &&
|
||
|
psi.stateIndex[bucket] != math.MaxUint32 {
|
||
|
return psi.stateIndex[bucket]
|
||
|
}
|
||
|
nState := uint32(len(psi.stateQueue))
|
||
|
psi.stateQueue = append(psi.stateQueue, ps)
|
||
|
|
||
|
psi.stateIndex[bucket] = nState
|
||
|
return nState
|
||
|
}
|
||
|
|
||
|
type ParametricDFA struct {
|
||
|
distance []uint8
|
||
|
transitions []Transition
|
||
|
maxDistance uint8
|
||
|
transitionStride uint32
|
||
|
diameter uint32
|
||
|
}
|
||
|
|
||
|
func (pdfa *ParametricDFA) initialState() ParametricState {
|
||
|
return ParametricState{shapeID: 1}
|
||
|
}
|
||
|
|
||
|
// Returns true iff whatever characters come afterward,
|
||
|
// we will never reach a shorter distance
|
||
|
func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
|
||
|
if state.isDeadEnd() {
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
remOffset := queryLen - state.offset
|
||
|
if remOffset < pdfa.diameter {
|
||
|
stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
|
||
|
prefixDistance := stateDistances[remOffset]
|
||
|
if prefixDistance > pdfa.maxDistance {
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
for _, d := range stateDistances {
|
||
|
if d < prefixDistance {
|
||
|
return false
|
||
|
}
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (pdfa *ParametricDFA) numStates() int {
|
||
|
return len(pdfa.transitions) / int(pdfa.transitionStride)
|
||
|
}
|
||
|
|
||
|
// min returns the smaller of x and y.
func min(x, y uint32) uint32 {
	if y <= x {
		return y
	}
	return x
}
|
||
|
|
||
|
func (pdfa *ParametricDFA) transition(state ParametricState,
|
||
|
chi uint32) Transition {
|
||
|
return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
|
||
|
}
|
||
|
|
||
|
func (pdfa *ParametricDFA) getDistance(state ParametricState,
|
||
|
qLen uint32) Distance {
|
||
|
remainingOffset := qLen - state.offset
|
||
|
if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
|
||
|
return Atleast{d: pdfa.maxDistance + 1}
|
||
|
}
|
||
|
dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
|
||
|
if dist > pdfa.maxDistance {
|
||
|
return Atleast{d: dist}
|
||
|
}
|
||
|
return Exact{d: dist}
|
||
|
}
|
||
|
|
||
|
func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
|
||
|
state := pdfa.initialState()
|
||
|
leftChars := []rune(left)
|
||
|
for _, chr := range []rune(right) {
|
||
|
start := state.offset
|
||
|
stop := min(start+pdfa.diameter, uint32(len(leftChars)))
|
||
|
chi := characteristicVector(leftChars[start:stop], chr)
|
||
|
transition := pdfa.transition(state, uint32(chi))
|
||
|
state = transition.apply(state)
|
||
|
if state.isDeadEnd() {
|
||
|
return Atleast{d: pdfa.maxDistance + 1}
|
||
|
}
|
||
|
}
|
||
|
return pdfa.getDistance(state, uint32(len(left)))
|
||
|
}
|
||
|
|
||
|
func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
|
||
|
prefix bool) (*DFA, error) {
|
||
|
qLen := uint32(len([]rune(query)))
|
||
|
alphabet := queryChars(query)
|
||
|
|
||
|
psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
|
||
|
maxNumStates := psi.maxNumStates()
|
||
|
deadEndStateID := psi.getOrAllocate(newParametricState())
|
||
|
if deadEndStateID != 0 {
|
||
|
return nil, fmt.Errorf("Invalid dead end state")
|
||
|
}
|
||
|
|
||
|
initialStateID := psi.getOrAllocate(pdfa.initialState())
|
||
|
dfaBuilder := withMaxStates(uint32(maxNumStates))
|
||
|
mask := uint32((1 << pdfa.diameter) - 1)
|
||
|
|
||
|
var stateID int
|
||
|
for stateID = 0; stateID < StateLimit; stateID++ {
|
||
|
if stateID == psi.numStates() {
|
||
|
break
|
||
|
}
|
||
|
state := psi.get(uint32(stateID))
|
||
|
if prefix && pdfa.isPrefixSink(state, qLen) {
|
||
|
distance := pdfa.getDistance(state, qLen)
|
||
|
dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
|
||
|
} else {
|
||
|
transition := pdfa.transition(state, 0)
|
||
|
defSuccessor := transition.apply(state)
|
||
|
defSuccessorID := psi.getOrAllocate(defSuccessor)
|
||
|
distance := pdfa.getDistance(state, qLen)
|
||
|
stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
|
||
|
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
|
||
|
}
|
||
|
|
||
|
alphabet.resetNext()
|
||
|
chr, cv, err := alphabet.next()
|
||
|
for err == nil {
|
||
|
chi := cv.shiftAndMask(state.offset, mask)
|
||
|
|
||
|
transition := pdfa.transition(state, chi)
|
||
|
|
||
|
destState := transition.apply(state)
|
||
|
|
||
|
destStateID := psi.getOrAllocate(destState)
|
||
|
|
||
|
stateBuilder.addTransition(chr, destStateID)
|
||
|
|
||
|
chr, cv, err = alphabet.next()
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if stateID == StateLimit {
|
||
|
return nil, ErrTooManyStates
|
||
|
}
|
||
|
|
||
|
dfaBuilder.setInitialState(initialStateID)
|
||
|
return dfaBuilder.build(distance), nil
|
||
|
}
|
||
|
|
||
|
func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
|
||
|
lookUp := newHash()
|
||
|
lookUp.getOrAllocate(*newMultiState())
|
||
|
initialState := nfa.initialStates()
|
||
|
lookUp.getOrAllocate(*initialState)
|
||
|
|
||
|
maxDistance := nfa.maxDistance()
|
||
|
msDiameter := nfa.msDiameter()
|
||
|
|
||
|
numChi := 1 << msDiameter
|
||
|
chiValues := make([]uint64, numChi)
|
||
|
for i := 0; i < numChi; i++ {
|
||
|
chiValues[i] = uint64(i)
|
||
|
}
|
||
|
|
||
|
transitions := make([]Transition, 0, numChi*int(msDiameter))
|
||
|
var stateID int
|
||
|
for stateID = 0; stateID < StateLimit; stateID++ {
|
||
|
if stateID == len(lookUp.items) {
|
||
|
break
|
||
|
}
|
||
|
|
||
|
for _, chi := range chiValues {
|
||
|
destMs := newMultiState()
|
||
|
|
||
|
ms := lookUp.getFromID(stateID)
|
||
|
|
||
|
nfa.transition(ms, destMs, chi)
|
||
|
|
||
|
translation := destMs.normalize()
|
||
|
|
||
|
destID := lookUp.getOrAllocate(*destMs)
|
||
|
|
||
|
transitions = append(transitions, Transition{
|
||
|
destShapeID: uint32(destID),
|
||
|
deltaOffset: translation,
|
||
|
})
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if stateID == StateLimit {
|
||
|
return nil, ErrTooManyStates
|
||
|
}
|
||
|
|
||
|
ns := len(lookUp.items)
|
||
|
diameter := int(msDiameter)
|
||
|
|
||
|
distances := make([]uint8, 0, diameter*ns)
|
||
|
for stateID := 0; stateID < ns; stateID++ {
|
||
|
ms := lookUp.getFromID(stateID)
|
||
|
for offset := 0; offset < diameter; offset++ {
|
||
|
dist := nfa.multistateDistance(ms, uint32(offset))
|
||
|
distances = append(distances, dist.distance())
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return &ParametricDFA{
|
||
|
diameter: uint32(msDiameter),
|
||
|
transitions: transitions,
|
||
|
maxDistance: maxDistance,
|
||
|
transitionStride: uint32(numChi),
|
||
|
distance: distances,
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
type hash struct {
|
||
|
index map[[16]byte]int
|
||
|
items []MultiState
|
||
|
}
|
||
|
|
||
|
func newHash() *hash {
|
||
|
return &hash{
|
||
|
index: make(map[[16]byte]int, 100),
|
||
|
items: make([]MultiState, 0, 100),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (h *hash) getOrAllocate(m MultiState) int {
|
||
|
size := len(h.items)
|
||
|
var exists bool
|
||
|
var pos int
|
||
|
md5 := getHash(&m)
|
||
|
if pos, exists = h.index[md5]; !exists {
|
||
|
h.index[md5] = size
|
||
|
pos = size
|
||
|
h.items = append(h.items, m)
|
||
|
}
|
||
|
return pos
|
||
|
}
|
||
|
|
||
|
func (h *hash) getFromID(id int) *MultiState {
|
||
|
return &h.items[id]
|
||
|
}
|
||
|
|
||
|
func getHash(ms *MultiState) [16]byte {
|
||
|
msBytes := []byte{}
|
||
|
for _, state := range ms.states {
|
||
|
jsonBytes, _ := json.Marshal(&state)
|
||
|
msBytes = append(msBytes, jsonBytes...)
|
||
|
}
|
||
|
return md5.Sum(msBytes)
|
||
|
}
|