Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2
* remove unused pkg from dep file
* change bleve from master to recent revision

tokarchuk/v1.17
parent 11e316654e
commit a380cfd8e0
@@ -1,22 +0,0 @@
The MIT License (MIT)

Copyright (c) 2015 Stephen Merity

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,229 +0,0 @@
package govarint

import "encoding/binary"
import "io"

type U32VarintEncoder interface {
	PutU32(x uint32) int
	Close()
}

type U32VarintDecoder interface {
	GetU32() (uint32, error)
}

///

type U64VarintEncoder interface {
	PutU64(x uint64) int
	Close()
}

type U64VarintDecoder interface {
	GetU64() (uint64, error)
}

///

type U32GroupVarintEncoder struct {
	w     io.Writer
	index int
	store [4]uint32
	temp  [17]byte
}

func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} }

func (b *U32GroupVarintEncoder) Flush() (int, error) {
	// TODO: Is it more efficient to have a tailored version that's called only in Close()?
	// If index is zero, there are no integers to flush
	if b.index == 0 {
		return 0, nil
	}
	// In the case we're flushing (the group isn't of size four), the non-values should be zero
	// This ensures the unused entries are all zero in the sizeByte
	for i := b.index; i < 4; i++ {
		b.store[i] = 0
	}
	length := 1
	// We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it
	b.temp[0] = 0
	for i, x := range b.store {
		size := byte(0)
		shifts := []byte{24, 16, 8, 0}
		for _, shift := range shifts {
			// Always writes at least one byte -- the first one (shift = 0)
			// Will write more bytes until the rest of the integer is all zeroes
			if (x>>shift) != 0 || shift == 0 {
				size += 1
				b.temp[length] = byte(x >> shift)
				length += 1
			}
		}
		// We store the size in two of the eight bits in the first byte (sizeByte)
		// 0 means there is one byte in total, hence why we subtract one from size
		b.temp[0] |= (size - 1) << (uint8(3-i) * 2)
	}
	// If we're flushing without a full group of four, remove the unused bytes we computed
	// This enables us to realize it's a partial group on decoding thanks to EOF
	if b.index != 4 {
		length -= 4 - b.index
	}
	_, err := b.w.Write(b.temp[:length])
	return length, err
}

func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) {
	bytesWritten := 0
	b.store[b.index] = x
	b.index += 1
	if b.index == 4 {
		n, err := b.Flush()
		if err != nil {
			return n, err
		}
		bytesWritten += n
		b.index = 0
	}
	return bytesWritten, nil
}

func (b *U32GroupVarintEncoder) Close() {
	// On Close, we flush any remaining values that might not have been in a full group
	b.Flush()
}

///

type U32GroupVarintDecoder struct {
	r        io.ByteReader
	group    [4]uint32
	pos      int
	finished bool
	capacity int
}

func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder {
	return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4}
}

func (b *U32GroupVarintDecoder) getGroup() error {
	// We should always receive a sizeByte if there are more values to read
	sizeByte, err := b.r.ReadByte()
	if err != nil {
		return err
	}
	// Calculate the size of the four incoming 32 bit integers
	// 0b00 means 1 byte to read, 0b01 = 2, etc
	b.group[0] = uint32((sizeByte >> 6) & 3)
	b.group[1] = uint32((sizeByte >> 4) & 3)
	b.group[2] = uint32((sizeByte >> 2) & 3)
	b.group[3] = uint32(sizeByte & 3)
	//
	for index, size := range b.group {
		b.group[index] = 0
		// Any error that occurs in earlier byte reads should be repeated at the end one
		// Hence we only catch and report the final ReadByte's error
		var err error
		switch size {
		case 0:
			var x byte
			x, err = b.r.ReadByte()
			b.group[index] = uint32(x)
		case 1:
			var x, y byte
			x, _ = b.r.ReadByte()
			y, err = b.r.ReadByte()
			b.group[index] = uint32(x)<<8 | uint32(y)
		case 2:
			var x, y, z byte
			x, _ = b.r.ReadByte()
			y, _ = b.r.ReadByte()
			z, err = b.r.ReadByte()
			b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z)
		case 3:
			var x, y, z, zz byte
			x, _ = b.r.ReadByte()
			y, _ = b.r.ReadByte()
			z, _ = b.r.ReadByte()
			zz, err = b.r.ReadByte()
			b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz)
		}
		if err != nil {
			if err == io.EOF {
				// If we hit EOF here, we have found a partial group
				// We return any valid entries we have read and return EOF once we run out
				b.capacity = index
				b.finished = true
				break
			} else {
				return err
			}
		}
	}
	// Reset the pos pointer to the beginning of the read values
	b.pos = 0
	return nil
}

func (b *U32GroupVarintDecoder) GetU32() (uint32, error) {
	// Check if we have any more values to give out - if not, let's get them
	if b.pos == b.capacity {
		// If finished is set, there is nothing else to do
		if b.finished {
			return 0, io.EOF
		}
		err := b.getGroup()
		if err != nil {
			return 0, err
		}
	}
	// Increment pointer and return the value stored at that point
	b.pos += 1
	return b.group[b.pos-1], nil
}

///

type Base128Encoder struct {
	w        io.Writer
	tmpBytes []byte
}

func NewU32Base128Encoder(w io.Writer) *Base128Encoder {
	return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)}
}
func NewU64Base128Encoder(w io.Writer) *Base128Encoder {
	return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
}

func (b *Base128Encoder) PutU32(x uint32) (int, error) {
	writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x))
	return b.w.Write(b.tmpBytes[:writtenBytes])
}

func (b *Base128Encoder) PutU64(x uint64) (int, error) {
	writtenBytes := binary.PutUvarint(b.tmpBytes, x)
	return b.w.Write(b.tmpBytes[:writtenBytes])
}

func (b *Base128Encoder) Close() {
}

///

type Base128Decoder struct {
	r io.ByteReader
}

func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }

func (b *Base128Decoder) GetU32() (uint32, error) {
	v, err := binary.ReadUvarint(b.r)
	return uint32(v), err
}

func (b *Base128Decoder) GetU64() (uint64, error) {
	return binary.ReadUvarint(b.r)
}
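Editor's note: a minimal round-trip sketch of the removed group-varint API, written as if it sat in the same govarint package as the code above; the sample values and the demo function name are illustrative, not part of the commit.

package govarint

import (
	"bytes"
	"fmt"
)

func demoGroupVarint() {
	var buf bytes.Buffer // bytes.Buffer satisfies both io.Writer and io.ByteReader

	enc := NewU32GroupVarintEncoder(&buf)
	for _, v := range []uint32{1, 300, 70000, 5, 9} { // one full group of 4, then a partial group
		enc.PutU32(v)
	}
	enc.Close() // Close flushes the trailing partial group

	dec := NewU32GroupVarintDecoder(&buf)
	for {
		v, err := dec.GetU32()
		if err != nil { // io.EOF once all values are consumed
			break
		}
		fmt.Println(v) // prints 1, 300, 70000, 5, 9 — one per line
	}
}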
@@ -0,0 +1,174 @@
// The code here was obtained from:
//   https://github.com/mmcloughlin/geohash

// The MIT License (MIT)
// Copyright (c) 2015 Michael McLoughlin
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package geo

import (
	"math"
)

// encoding encapsulates an encoding defined by a given base32 alphabet.
type encoding struct {
	enc string
	dec [256]byte
}

// newEncoding constructs a new encoding defined by the given alphabet,
// which must be a 32-byte string.
func newEncoding(encoder string) *encoding {
	e := new(encoding)
	e.enc = encoder
	for i := 0; i < len(e.dec); i++ {
		e.dec[i] = 0xff
	}
	for i := 0; i < len(encoder); i++ {
		e.dec[encoder[i]] = byte(i)
	}
	return e
}

// Decode string into bits of a 64-bit word. The string s may be at most 12
// characters.
func (e *encoding) decode(s string) uint64 {
	x := uint64(0)
	for i := 0; i < len(s); i++ {
		x = (x << 5) | uint64(e.dec[s[i]])
	}
	return x
}

// Encode bits of 64-bit word into a string.
func (e *encoding) encode(x uint64) string {
	b := [12]byte{}
	for i := 0; i < 12; i++ {
		b[11-i] = e.enc[x&0x1f]
		x >>= 5
	}
	return string(b[:])
}

// Base32Encoding with the Geohash alphabet.
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")

// BoundingBox returns the region encoded by the given string geohash.
func geoBoundingBox(hash string) geoBox {
	bits := uint(5 * len(hash))
	inthash := base32encoding.decode(hash)
	return geoBoundingBoxIntWithPrecision(inthash, bits)
}

// Box represents a rectangle in latitude/longitude space.
type geoBox struct {
	minLat float64
	maxLat float64
	minLng float64
	maxLng float64
}

// Round returns a point inside the box, making an effort to round to minimal
// precision.
func (b geoBox) round() (lat, lng float64) {
	x := maxDecimalPower(b.maxLat - b.minLat)
	lat = math.Ceil(b.minLat/x) * x
	x = maxDecimalPower(b.maxLng - b.minLng)
	lng = math.Ceil(b.minLng/x) * x
	return
}

// precalculated for performance
var exp232 = math.Exp2(32)

// errorWithPrecision returns the error range in latitude and longitude for an
// integer geohash with bits of precision.
func errorWithPrecision(bits uint) (latErr, lngErr float64) {
	b := int(bits)
	latBits := b / 2
	lngBits := b - latBits
	latErr = math.Ldexp(180.0, -latBits)
	lngErr = math.Ldexp(360.0, -lngBits)
	return
}

// maxDecimalPower returns the minimum number of decimal places such that
// there must exist a number with that many places within any range of width
// r. This is intended for returning minimal precision coordinates inside a
// box.
func maxDecimalPower(r float64) float64 {
	m := int(math.Floor(math.Log10(r)))
	return math.Pow10(m)
}

// Encode the position of x within the range -r to +r as a 32-bit integer.
func encodeRange(x, r float64) uint32 {
	p := (x + r) / (2 * r)
	return uint32(p * exp232)
}

// Decode the 32-bit range encoding X back to a value in the range -r to +r.
func decodeRange(X uint32, r float64) float64 {
	p := float64(X) / exp232
	x := 2*r*p - r
	return x
}

// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are
// ignored, and may take any value.
func squash(X uint64) uint32 {
	X &= 0x5555555555555555
	X = (X | (X >> 1)) & 0x3333333333333333
	X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f
	X = (X | (X >> 4)) & 0x00ff00ff00ff00ff
	X = (X | (X >> 8)) & 0x0000ffff0000ffff
	X = (X | (X >> 16)) & 0x00000000ffffffff
	return uint32(X)
}

// Deinterleave the bits of X into 32-bit words containing the even and odd
// bitlevels of X, respectively.
func deinterleave(X uint64) (uint32, uint32) {
	return squash(X), squash(X >> 1)
}

// BoundingBoxIntWithPrecision returns the region encoded by the integer
// geohash with the specified precision.
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox {
	fullHash := hash << (64 - bits)
	latInt, lngInt := deinterleave(fullHash)
	lat := decodeRange(latInt, 90)
	lng := decodeRange(lngInt, 180)
	latErr, lngErr := errorWithPrecision(bits)
	return geoBox{
		minLat: lat,
		maxLat: lat + latErr,
		minLng: lng,
		maxLng: lng + lngErr,
	}
}

// ----------------------------------------------------------------------

// Decode the string geohash to a (lat, lng) point.
func GeoHashDecode(hash string) (lat, lng float64) {
	box := geoBoundingBox(hash)
	return box.round()
}
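Editor's note: a short usage sketch of the exported helper added above, written as if it lived in the same geo package; the example hash and the demo function name are illustrative.

package geo

import "fmt"

func demoGeoHashDecode() {
	// "u4pruydqqvj" is a commonly cited example geohash for roughly 57.649N, 10.407E.
	lat, lng := GeoHashDecode("u4pruydqqvj")
	fmt.Printf("%.3f %.3f\n", lat, lng) // approximately 57.649 10.407
}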
@@ -0,0 +1,420 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scorch

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"

	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
	"github.com/blevesearch/bleve/index/scorch/segment/zap"
)

var OptimizeConjunction = true
var OptimizeConjunctionUnadorned = true
var OptimizeDisjunctionUnadorned = true

func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if OptimizeConjunction && kind == "conjunction" {
		return s.optimizeConjunction(octx)
	}

	if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
		return s.optimizeConjunctionUnadorned(octx)
	}

	if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
		return s.optimizeDisjunctionUnadorned(octx)
	}

	return octx, nil
}

var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)

// ----------------------------------------------------------------

func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if octx == nil {
		octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
	}

	o, ok := octx.(*OptimizeTFRConjunction)
	if !ok {
		return octx, nil
	}

	if o.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
	}

	o.tfrs = append(o.tfrs, s)

	return o, nil
}

type OptimizeTFRConjunction struct {
	snapshot *IndexSnapshot

	tfrs []*IndexSnapshotTermFieldReader
}

func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	for i := range o.snapshot.segment {
		itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
		if !ok || itr0.ActualBM == nil {
			continue
		}

		itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
		if !ok || itr1.ActualBM == nil {
			continue
		}

		bm := roaring.And(itr0.ActualBM, itr1.ActualBM)

		for _, tfr := range o.tfrs[2:] {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok || itr.ActualBM == nil {
				continue
			}

			bm.And(itr.ActualBM)
		}

		// in this conjunction optimization, the postings iterators
		// will all share the same AND'ed together actual bitmap. The
		// regular conjunction searcher machinery will still be used,
		// but the underlying bitmap will be smaller.
		for _, tfr := range o.tfrs {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if ok && itr.ActualBM != nil {
				itr.ActualBM = bm
				itr.Actual = bm.Iterator()
			}
		}
	}

	return nil, nil
}

// ----------------------------------------------------------------

// An "unadorned" conjunction optimization is appropriate when
// additional or subsidiary information like freq-norm's and
// term-vectors are not required, and instead only the internal-id's
// are needed.
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if octx == nil {
		octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
	}

	o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
	if !ok {
		return nil, nil
	}

	if o.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
	}

	o.tfrs = append(o.tfrs, s)

	return o, nil
}

type OptimizeTFRConjunctionUnadorned struct {
	snapshot *IndexSnapshot

	tfrs []*IndexSnapshotTermFieldReader
}

var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
var OptimizeTFRConjunctionUnadornedField = "*"

// Finish of an unadorned conjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps AND'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	// We use an artificial term and field because the optimized
	// termFieldReader can represent multiple terms and fields.
	oTFR := &IndexSnapshotTermFieldReader{
		term:               OptimizeTFRConjunctionUnadornedTerm,
		field:              OptimizeTFRConjunctionUnadornedField,
		snapshot:           o.snapshot,
		iterators:          make([]segment.PostingsIterator, len(o.snapshot.segment)),
		segmentOffset:      0,
		includeFreq:        false,
		includeNorm:        false,
		includeTermVectors: false,
	}

	var actualBMs []*roaring.Bitmap // Collected from regular posting lists.

OUTER:
	for i := range o.snapshot.segment {
		actualBMs = actualBMs[:0]

		var docNum1HitLast uint64
		var docNum1HitLastOk bool

		for _, tfr := range o.tfrs {
			if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
				// An empty postings iterator means the entire AND is empty.
				oTFR.iterators[i] = segment.AnEmptyPostingsIterator
				continue OUTER
			}

			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok {
				// We optimize zap postings iterators only.
				return nil, nil
			}

			// If the postings iterator is "1-hit" optimized, then we
			// can perform several optimizations up-front here.
			docNum1Hit, ok := itr.DocNum1Hit()
			if ok {
				if docNum1Hit == zap.DocNum1HitFinished {
					// An empty docNum here means the entire AND is empty.
					oTFR.iterators[i] = segment.AnEmptyPostingsIterator
					continue OUTER
				}

				if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
					// The docNum1Hit doesn't match the previous
					// docNum1HitLast, so the entire AND is empty.
					oTFR.iterators[i] = segment.AnEmptyPostingsIterator
					continue OUTER
				}

				docNum1HitLast = docNum1Hit
				docNum1HitLastOk = true

				continue
			}

			if itr.ActualBM == nil {
				// An empty actual bitmap means the entire AND is empty.
				oTFR.iterators[i] = segment.AnEmptyPostingsIterator
				continue OUTER
			}

			// Collect the actual bitmap for more processing later.
			actualBMs = append(actualBMs, itr.ActualBM)
		}

		if docNum1HitLastOk {
			// We reach here if all the 1-hit optimized posting
			// iterators had the same 1-hit docNum, so we can check if
			// our collected actual bitmaps also have that docNum.
			for _, bm := range actualBMs {
				if !bm.Contains(uint32(docNum1HitLast)) {
					// The docNum1Hit isn't in one of our actual
					// bitmaps, so the entire AND is empty.
					oTFR.iterators[i] = segment.AnEmptyPostingsIterator
					continue OUTER
				}
			}

			// The actual bitmaps and docNum1Hits all contain or have
			// the same 1-hit docNum, so that's our AND'ed result.
			oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
				docNum1HitLast, zap.NormBits1Hit, false, false)
			if err != nil {
				return nil, nil
			}

			continue OUTER
		}

		if len(actualBMs) == 0 {
			// If we've collected no actual bitmaps at this point,
			// then the entire AND is empty.
			oTFR.iterators[i] = segment.AnEmptyPostingsIterator
			continue OUTER
		}

		if len(actualBMs) == 1 {
			// If we've only 1 actual bitmap, then that's our result.
			oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
				actualBMs[0], false, false)
			if err != nil {
				return nil, nil
			}

			continue OUTER
		}

		// Else, AND together our collected bitmaps as our result.
		bm := roaring.And(actualBMs[0], actualBMs[1])

		for _, actualBM := range actualBMs[2:] {
			bm.And(actualBM)
		}

		oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
			bm, false, false)
		if err != nil {
			return nil, nil
		}
	}

	return oTFR, nil
}

// ----------------------------------------------------------------

// An "unadorned" disjunction optimization is appropriate when
// additional or subsidiary information like freq-norm's and
// term-vectors are not required, and instead only the internal-id's
// are needed.
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if octx == nil {
		octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
	}

	o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
	if !ok {
		return nil, nil
	}

	if o.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
	}

	o.tfrs = append(o.tfrs, s)

	return o, nil
}

type OptimizeTFRDisjunctionUnadorned struct {
	snapshot *IndexSnapshot

	tfrs []*IndexSnapshotTermFieldReader
}

var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
var OptimizeTFRDisjunctionUnadornedField = "*"

// Finish of an unadorned disjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps OR'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	for i := range o.snapshot.segment {
		var cMax uint64

		for _, tfr := range o.tfrs {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok {
				return nil, nil
			}

			if itr.ActualBM != nil {
				c := itr.ActualBM.GetCardinality()
				if cMax < c {
					cMax = c
				}
			}
		}

		// Heuristic to skip the optimization if all the constituent
		// bitmaps are too small, where the processing & resource
		// overhead to create the OR'ed bitmap outweighs the benefit.
		if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
			return nil, nil
		}
	}

	// We use an artificial term and field because the optimized
	// termFieldReader can represent multiple terms and fields.
	oTFR := &IndexSnapshotTermFieldReader{
		term:               OptimizeTFRDisjunctionUnadornedTerm,
		field:              OptimizeTFRDisjunctionUnadornedField,
		snapshot:           o.snapshot,
		iterators:          make([]segment.PostingsIterator, len(o.snapshot.segment)),
		segmentOffset:      0,
		includeFreq:        false,
		includeNorm:        false,
		includeTermVectors: false,
	}

	var docNums []uint32            // Collected docNum's from 1-hit posting lists.
	var actualBMs []*roaring.Bitmap // Collected from regular posting lists.

	for i := range o.snapshot.segment {
		docNums = docNums[:0]
		actualBMs = actualBMs[:0]

		for _, tfr := range o.tfrs {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok {
				return nil, nil
			}

			docNum, ok := itr.DocNum1Hit()
			if ok {
				docNums = append(docNums, uint32(docNum))
				continue
			}

			if itr.ActualBM != nil {
				actualBMs = append(actualBMs, itr.ActualBM)
			}
		}

		var bm *roaring.Bitmap
		if len(actualBMs) > 2 {
			bm = roaring.HeapOr(actualBMs...)
		} else if len(actualBMs) == 2 {
			bm = roaring.Or(actualBMs[0], actualBMs[1])
		} else if len(actualBMs) == 1 {
			bm = actualBMs[0].Clone()
		}

		if bm == nil {
			bm = roaring.New()
		}

		bm.AddMany(docNums)

		oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false)
		if err != nil {
			return nil, nil
		}
	}

	return oTFR, nil
}
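Editor's note: both Finish() paths above ultimately reduce to composing roaring bitmaps. A small standalone sketch of that composition using the same github.com/RoaringBitmap/roaring calls; the bitmap contents are made up for illustration.

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

func main() {
	// Pretend these are the per-segment posting bitmaps of three term readers.
	a := roaring.BitmapOf(1, 3, 5, 7, 9)
	b := roaring.BitmapOf(3, 5, 9)
	c := roaring.BitmapOf(5, 9, 11)

	// Conjunction path: AND the first two, then AND the rest in place.
	and := roaring.And(a, b)
	and.And(c)
	fmt.Println(and.ToArray()) // [5 9]

	// Disjunction path: HeapOr is used once there are more than two bitmaps.
	or := roaring.HeapOr(a, b, c)
	fmt.Println(or.GetCardinality()) // 6
}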
@@ -1,110 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scorch

import (
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

type Reader struct {
	root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
}

func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
	includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
	return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
}

// DocIDReader returns an iterator over all doc ids
// The caller must close returned instance to release associated resources.
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
	return r.root.DocIDReaderAll()
}

func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
	return r.root.DocIDReaderOnly(ids)
}

func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
	return r.root.FieldDict(field)
}

// FieldDictRange is currently defined to include the start and end terms
func (r *Reader) FieldDictRange(field string, startTerm []byte,
	endTerm []byte) (index.FieldDict, error) {
	return r.root.FieldDictRange(field, startTerm, endTerm)
}

func (r *Reader) FieldDictPrefix(field string,
	termPrefix []byte) (index.FieldDict, error) {
	return r.root.FieldDictPrefix(field, termPrefix)
}

func (r *Reader) Document(id string) (*document.Document, error) {
	return r.root.Document(id)
}

func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
	visitor index.DocumentFieldTermVisitor) error {
	return r.root.DocumentVisitFieldTerms(id, fields, visitor)
}

func (r *Reader) Fields() ([]string, error) {
	return r.root.Fields()
}

func (r *Reader) GetInternal(key []byte) ([]byte, error) {
	return r.root.GetInternal(key)
}

func (r *Reader) DocCount() (uint64, error) {
	return r.root.DocCount()
}

func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
	return r.root.ExternalID(id)
}

func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
	return r.root.InternalID(id)
}

func (r *Reader) DumpAll() chan interface{} {
	rv := make(chan interface{})
	go func() {
		close(rv)
	}()
	return rv
}

func (r *Reader) DumpDoc(id string) chan interface{} {
	rv := make(chan interface{})
	go func() {
		close(rv)
	}()
	return rv
}

func (r *Reader) DumpFields() chan interface{} {
	rv := make(chan interface{})
	go func() {
		close(rv)
	}()
	return rv
}

func (r *Reader) Close() error {
	return r.root.DecRef()
}
@@ -1,321 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mem

import (
	"math"
	"sort"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
	s := New()

	// ensure that the _id field gets fieldID 0
	s.getOrDefineField("_id")

	// fill Dicts/DictKeys and preallocate memory
	s.initializeDict(results)

	// walk each doc
	for _, result := range results {
		s.processDocument(result)
	}

	// go back and sort the dictKeys
	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}

	// compute memory usage of segment
	s.updateSizeInBytes()

	// professional debugging
	//
	// log.Printf("fields: %v\n", s.FieldsMap)
	// log.Printf("fieldsInv: %v\n", s.FieldsInv)
	// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
	// log.Printf("dicts: %v\n", s.Dicts)
	// log.Printf("dict keys: %v\n", s.DictKeys)
	// for i, posting := range s.Postings {
	// 	log.Printf("posting %d: %v\n", i, posting)
	// }
	// for i, freq := range s.Freqs {
	// 	log.Printf("freq %d: %v\n", i, freq)
	// }
	// for i, norm := range s.Norms {
	// 	log.Printf("norm %d: %v\n", i, norm)
	// }
	// for i, field := range s.Locfields {
	// 	log.Printf("field %d: %v\n", i, field)
	// }
	// for i, start := range s.Locstarts {
	// 	log.Printf("start %d: %v\n", i, start)
	// }
	// for i, end := range s.Locends {
	// 	log.Printf("end %d: %v\n", i, end)
	// }
	// for i, pos := range s.Locpos {
	// 	log.Printf("pos %d: %v\n", i, pos)
	// }
	// for i, apos := range s.Locarraypos {
	// 	log.Printf("apos %d: %v\n", i, apos)
	// }
	// log.Printf("stored: %v\n", s.Stored)
	// log.Printf("stored types: %v\n", s.StoredTypes)
	// log.Printf("stored pos: %v\n", s.StoredPos)

	return s
}

// fill Dicts/DictKeys and preallocate memory for postings
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
	var numPostingsLists int

	numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
	numLocsPerPostingsList := make([]int, 0, 64)  // Keyed by postings list id.

	var numTokenFrequencies int
	var totLocs int

	// initial scan for all fieldID's to sort them
	for _, result := range results {
		for _, field := range result.Document.CompositeFields {
			s.getOrDefineField(field.Name())
		}
		for _, field := range result.Document.Fields {
			s.getOrDefineField(field.Name())
		}
	}
	sort.Strings(s.FieldsInv[1:]) // keep _id as first field
	s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}

	processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
		for term, tf := range tfs {
			pidPlus1, exists := s.Dicts[fieldID][term]
			if !exists {
				numPostingsLists++
				pidPlus1 = uint64(numPostingsLists)
				s.Dicts[fieldID][term] = pidPlus1
				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
				numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
				numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
			}
			pid := pidPlus1 - 1
			numTermsPerPostingsList[pid] += 1
			numLocsPerPostingsList[pid] += len(tf.Locations)
			totLocs += len(tf.Locations)
		}
		numTokenFrequencies += len(tfs)
	}

	for _, result := range results {
		// walk each composite field
		for _, field := range result.Document.CompositeFields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			_, tf := field.Analyze()
			processField(fieldID, tf)
		}

		// walk each field
		for i, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			tf := result.Analyzed[i]
			processField(fieldID, tf)
		}
	}

	s.Postings = make([]*roaring.Bitmap, numPostingsLists)
	for i := 0; i < numPostingsLists; i++ {
		s.Postings[i] = roaring.New()
	}
	s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
	for i := 0; i < numPostingsLists; i++ {
		s.PostingsLocs[i] = roaring.New()
	}

	// Preallocate big, contiguous backing arrays.
	auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
	uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
	float32Backing := make([]float32, numTokenFrequencies)         // For sub-Norms.
	uint16Backing := make([]uint16, totLocs)                       // For sub-Locfields.

	// Point top-level slices to the backing arrays.
	s.Freqs = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Norms = make([][]float32, numPostingsLists)

	s.Locfields = make([][]uint16, numPostingsLists)

	s.Locstarts = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Locends = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Locpos = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Locarraypos = make([][][]uint64, numPostingsLists)

	// Point sub-slices to the backing arrays.
	for pid, numTerms := range numTermsPerPostingsList {
		s.Freqs[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numTerms:]

		s.Norms[pid] = float32Backing[0:0]
		float32Backing = float32Backing[numTerms:]
	}

	for pid, numLocs := range numLocsPerPostingsList {
		s.Locfields[pid] = uint16Backing[0:0]
		uint16Backing = uint16Backing[numLocs:]

		s.Locstarts[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]

		s.Locends[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]

		s.Locpos[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]

		s.Locarraypos[pid] = auint64Backing[0:0]
		auint64Backing = auint64Backing[numLocs:]
	}
}

func (s *Segment) processDocument(result *index.AnalysisResult) {
	// used to collate information across fields
	docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
	fieldLens := make(map[uint16]int, len(s.FieldsMap))

	docNum := uint64(s.addDocument())

	processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
		fieldLens[field] += l
		if existingFreqs, ok := docMap[field]; ok {
			existingFreqs.MergeAll(name, tf)
		} else {
			docMap[field] = tf
		}
	}

	storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
		s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
		s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
		s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
	}

	// walk each composite field
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		l, tf := field.Analyze()
		processField(fieldID, field.Name(), l, tf)
	}

	// walk each field
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		l := result.Length[i]
		tf := result.Analyzed[i]
		processField(fieldID, field.Name(), l, tf)
		if field.Options().IsStored() {
			storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
		}

		if field.Options().IncludeDocValues() {
			s.DocValueFields[fieldID] = true
		}
	}

	// now that it's been rolled up into docMap, walk that
	for fieldID, tokenFrequencies := range docMap {
		for term, tokenFreq := range tokenFrequencies {
			pid := s.Dicts[fieldID][term] - 1
			bs := s.Postings[pid]
			bs.AddInt(int(docNum))
			s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
			s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
			locationBS := s.PostingsLocs[pid]
			if len(tokenFreq.Locations) > 0 {
				locationBS.AddInt(int(docNum))
				for _, loc := range tokenFreq.Locations {
					var locf = fieldID
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					s.Locfields[pid] = append(s.Locfields[pid], locf)
					s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
					s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
					s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
					if len(loc.ArrayPositions) > 0 {
						s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
					} else {
						s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
					}
				}
			}
		}
	}
}

func (s *Segment) getOrDefineField(name string) int {
	fieldIDPlus1, ok := s.FieldsMap[name]
	if !ok {
		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
		s.FieldsMap[name] = fieldIDPlus1
		s.FieldsInv = append(s.FieldsInv, name)
		s.Dicts = append(s.Dicts, make(map[string]uint64))
		s.DictKeys = append(s.DictKeys, make([]string, 0))
	}
	return int(fieldIDPlus1 - 1)
}

func (s *Segment) addDocument() int {
	docNum := len(s.Stored)
	s.Stored = append(s.Stored, map[uint16][][]byte{})
	s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
	s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
	return docNum
}

func encodeFieldType(f document.Field) byte {
	fieldType := byte('x')
	switch f.(type) {
	case *document.TextField:
		fieldType = 't'
	case *document.NumericField:
		fieldType = 'n'
	case *document.DateTimeField:
		fieldType = 'd'
	case *document.BooleanField:
		fieldType = 'b'
	case *document.GeoPointField:
		fieldType = 'g'
	case *document.CompositeField:
		fieldType = 'c'
	}
	return fieldType
}
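Editor's note: the preallocation in initializeDict above leans on a Go sub-slicing idiom — carve zero-length slices out of one contiguous backing array and let later appends fill the reserved capacity without new allocations. A tiny standalone illustration with made-up counts:

package main

import "fmt"

func main() {
	// One contiguous allocation backs every per-postings-list slice.
	backing := make([]uint64, 5) // capacity budgeted up front (2 + 3 here)

	freqsA := backing[0:0] // len 0, capacity reaches into backing
	backing = backing[2:]  // reserve 2 entries for list A

	freqsB := backing[0:0]
	backing = backing[3:] // reserve 3 entries for list B

	// Later appends stay within each reserved capacity, so nothing reallocates.
	freqsA = append(freqsA, 7, 8)
	freqsB = append(freqsB, 1, 2, 3)

	fmt.Println(freqsA, freqsB) // [7 8] [1 2 3]
}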
@@ -1,103 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mem

import (
	"sort"
	"strings"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
)

// Dictionary is the in-memory representation of the term dictionary
type Dictionary struct {
	segment *Segment
	field   string
	fieldID uint16
}

// PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term string,
	except *roaring.Bitmap) (segment.PostingsList, error) {
	return &PostingsList{
		dictionary: d,
		term:       term,
		postingsID: d.segment.Dicts[d.fieldID][term],
		except:     except,
	}, nil
}

// Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator {
	return &DictionaryIterator{
		d: d,
	}
}

// PrefixIterator returns an iterator which only visits terms having the
// specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
	offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
	return &DictionaryIterator{
		d:      d,
		prefix: prefix,
		offset: offset,
	}
}

// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
	offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
	return &DictionaryIterator{
		d:      d,
		offset: offset,
		end:    end,
	}
}

// DictionaryIterator is an iterator for the term dictionary
type DictionaryIterator struct {
	d      *Dictionary
	prefix string
	end    string
	offset int

	dictEntry index.DictEntry // reused across Next()'s
}

// Next returns the next entry in the dictionary
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
	if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
		return nil, nil
	}
	next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
	// check prefix
	if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
		return nil, nil
	}
	// check end (bleve.index API demands inclusive end)
	if d.end != "" && next > d.end {
		return nil, nil
	}

	d.offset++
	postingID := d.d.segment.Dicts[d.d.fieldID][next]
	d.dictEntry.Term = next
	d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality()
	return &d.dictEntry, nil
}
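Editor's note: PrefixIterator and RangeIterator above work because DictKeys are kept sorted, so a binary search gives the starting offset. A standalone sketch of that lookup using the standard library (sample terms are made up):

package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Sorted term keys for one field, as DictKeys holds them.
	keys := []string{"apple", "apply", "banana", "band", "bane"}
	prefix := "ban"

	// First candidate term >= the prefix, mirroring PrefixIterator's offset.
	offset := sort.SearchStrings(keys, prefix)
	for _, k := range keys[offset:] {
		if !strings.HasPrefix(k, prefix) {
			break // same stop condition DictionaryIterator.Next applies
		}
		fmt.Println(k) // banana, band, bane
	}
}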
@@ -1,178 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mem

import (
	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/index/scorch/segment"
)

// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
	dictionary *Dictionary
	term       string
	postingsID uint64
	except     *roaring.Bitmap
}

// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
	var rv uint64
	if p.postingsID > 0 {
		rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
		if p.except != nil {
			except := p.except.GetCardinality()
			if except > rv {
				// avoid underflow
				except = rv
			}
			rv -= except
		}
	}
	return rv
}

// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator() segment.PostingsIterator {
	rv := &PostingsIterator{
		postings: p,
	}
	if p.postingsID > 0 {
		allbits := p.dictionary.segment.Postings[p.postingsID-1]
		rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
		rv.all = allbits.Iterator()
		if p.except != nil {
			allExcept := allbits.Clone()
			allExcept.AndNot(p.except)
			rv.actual = allExcept.Iterator()
		} else {
			rv.actual = allbits.Iterator()
		}
	}

	return rv
}

// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
	postings  *PostingsList
	all       roaring.IntIterable
	locations *roaring.Bitmap
	offset    int
	locoffset int
	actual    roaring.IntIterable
}

// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
	if i.actual == nil || !i.actual.HasNext() {
		return nil, nil
	}
	n := i.actual.Next()
	allN := i.all.Next()

	// n is the next actual hit (excluding some postings)
	// allN is the next hit in the full postings
	// if they don't match, adjust offsets to factor in the item we're skipping over
	// incr the all iterator, and check again
	for allN != n {
		i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
		i.offset++
		allN = i.all.Next()
	}
	rv := &Posting{
		iterator:  i,
		docNum:    uint64(n),
		offset:    i.offset,
		locoffset: i.locoffset,
		hasLoc:    i.locations.Contains(n),
	}

	i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
	i.offset++
	return rv, nil
}

// Posting is a single entry in a postings list
type Posting struct {
	iterator  *PostingsIterator
	docNum    uint64
	offset    int
	locoffset int
	hasLoc    bool
}

// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
	return p.docNum
}

// Frequency returns the frequency of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 {
	return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
}

// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
	return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
}

// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
	if !p.hasLoc {
		return nil
	}
	freq := int(p.Frequency())
	rv := make([]segment.Location, freq)
	for i := 0; i < freq; i++ {
		rv[i] = &Location{
			p:      p,
			offset: p.locoffset + i,
		}
	}
	return rv
}

// Location represents the location of a single occurrence
type Location struct {
	p      *Posting
	offset int
}

// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
	return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
}

// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
	return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
}

// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
	return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
}

// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
	return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
}

// ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 {
	return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
}
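Editor's note: the "except" handling in PostingsList.Iterator above is plain bitmap subtraction. A standalone sketch of that step with the same roaring calls (doc numbers are made up):

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

func main() {
	// The except bitmap (e.g. excluded docs) is subtracted from the full
	// postings before iteration, mirroring PostingsList.Iterator above.
	all := roaring.BitmapOf(2, 4, 6, 8)
	except := roaring.BitmapOf(4, 8)

	actual := all.Clone()
	actual.AndNot(except)

	it := actual.Iterator()
	for it.HasNext() {
		fmt.Println(it.Next()) // 2, then 6
	}
}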
@ -1,289 +0,0 @@ |
||||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem |
||||
|
||||
import ( |
||||
"fmt" |
||||
|
||||
"github.com/RoaringBitmap/roaring" |
||||
"github.com/blevesearch/bleve/index/scorch/segment" |
||||
) |
||||
|
||||
// _id field is always guaranteed to have fieldID of 0
|
||||
const idFieldID uint16 = 0 |
||||
|
||||
// KNOWN ISSUES
|
||||
// - LIMITATION - we decided whether or not to store term vectors for a field
|
||||
// at the segment level, based on the first definition of a
|
||||
// field we see. in normal bleve usage this is fine, all
|
||||
// instances of a field definition will be the same. however,
|
||||
// advanced users may violate this and provide unique field
|
||||
// definitions with each document. this segment does not
|
||||
// support this usage.
|
||||
|
||||
// TODO
|
||||
// - need better testing of multiple docs, iterating freqs, locations and
|
||||
// and verifying the correct results are returned
|
||||
|
||||
// Segment is an in memory implementation of scorch.Segment
|
||||
type Segment struct { |
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16 |
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string |
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64 |
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string |
||||
|
||||
// Postings list
|
||||
// postings list id -> bitmap by docNum
|
||||
Postings []*roaring.Bitmap |
||||
|
||||
// Postings list has locations
|
||||
PostingsLocs []*roaring.Bitmap |
||||
|
||||
// Term frequencies
|
||||
// postings list id -> Freqs (one for each hit in bitmap)
|
||||
Freqs [][]uint64 |
||||
|
||||
// Field norms
|
||||
// postings list id -> Norms (one for each hit in bitmap)
|
||||
Norms [][]float32 |
||||
|
||||
// Field/start/end/pos/locarraypos
|
||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
||||
Locfields [][]uint16 |
||||
Locstarts [][]uint64 |
||||
Locends [][]uint64 |
||||
Locpos [][]uint64 |
||||
Locarraypos [][][]uint64 |
||||
|
||||
// Stored field values
|
||||
// docNum -> field id -> slice of values (each value []byte)
|
||||
Stored []map[uint16][][]byte |
||||
|
||||
// Stored field types
|
||||
// docNum -> field id -> slice of types (each type byte)
|
||||
StoredTypes []map[uint16][]byte |
||||
|
||||
// Stored field array positions
|
||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
||||
StoredPos []map[uint16][][]uint64 |
||||
|
||||
// For storing the docValue persisted fields
|
||||
DocValueFields map[uint16]bool |
||||
|
||||
// Footprint of the segment, updated when analyzed document mutations
|
||||
// are added into the segment
|
||||
sizeInBytes uint64 |
||||
} |
||||
|
||||
// New builds a new empty Segment
|
||||
func New() *Segment { |
||||
return &Segment{ |
||||
FieldsMap: map[string]uint16{}, |
||||
DocValueFields: map[uint16]bool{}, |
||||
} |
||||
} |
||||
|
||||
func (s *Segment) updateSizeInBytes() { |
||||
var sizeInBytes uint64 |
||||
|
||||
// FieldsMap, FieldsInv
|
||||
for k := range s.FieldsMap { |
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + |
||||
2 /* size of uint16 */) |
||||
} |
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) |
||||
|
||||
// Dicts, DictKeys
|
||||
for _, entry := range s.Dicts { |
||||
for k := range entry { |
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + |
||||
8 /* size of uint64 */) |
||||
} |
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) |
||||
} |
||||
sizeInBytes += (segment.SizeOfSlice * 2) |
||||
|
||||
// Postings, PostingsLocs
|
||||
for i := 0; i < len(s.Postings); i++ { |
||||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + |
||||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) |
||||
} |
||||
sizeInBytes += (segment.SizeOfSlice * 2) |
||||
|
||||
// Freqs, Norms
|
||||
for i := 0; i < len(s.Freqs); i++ { |
||||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + |
||||
len(s.Norms[i])*4 /* size of float32 */) + |
||||
(segment.SizeOfSlice * 2) |
||||
} |
||||
sizeInBytes += (segment.SizeOfSlice * 2) |
||||
|
||||
// Location data
|
||||
for i := 0; i < len(s.Locfields); i++ { |
||||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + |
||||
len(s.Locstarts[i])*8 /* size of uint64 */ + |
||||
len(s.Locends[i])*8 /* size of uint64 */ + |
||||
len(s.Locpos[i])*8 /* size of uint64 */) |
||||
|
||||
for j := 0; j < len(s.Locarraypos[i]); j++ { |
||||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + |
||||
segment.SizeOfSlice |
||||
} |
||||
|
||||
sizeInBytes += (segment.SizeOfSlice * 5) |
||||
} |
||||
sizeInBytes += (segment.SizeOfSlice * 5) |
||||
|
||||
// Stored data
|
||||
for i := 0; i < len(s.Stored); i++ { |
||||
for _, v := range s.Stored[i] { |
||||
sizeInBytes += uint64(2 /* size of uint16 */) |
||||
for _, arr := range v { |
||||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice |
||||
} |
||||
sizeInBytes += segment.SizeOfSlice |
||||
} |
||||
|
||||
for _, v := range s.StoredTypes[i] { |
||||
sizeInBytes += uint64(2 /* size of uint16 */ + len(v)) + segment.SizeOfSlice |
||||
} |
||||
|
||||
for _, v := range s.StoredPos[i] { |
||||
sizeInBytes += uint64(2 /* size of uint16 */) |
||||
for _, arr := range v { |
||||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + |
||||
segment.SizeOfSlice |
||||
} |
||||
sizeInBytes += segment.SizeOfSlice |
||||
} |
||||
|
||||
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfMap * 3) |
||||
} |
||||
// overhead from data structures: Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfSlice * 3) |
||||
|
||||
// DocValueFields
|
||||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + |
||||
segment.SizeOfMap |
||||
|
||||
// SizeInBytes
|
||||
sizeInBytes += uint64(8) |
||||
|
||||
s.sizeInBytes = sizeInBytes |
||||
} |
||||
|
||||
func (s *Segment) SizeInBytes() uint64 { |
||||
return s.sizeInBytes |
||||
} |
||||
|
||||
func (s *Segment) AddRef() { |
||||
} |
||||
|
||||
func (s *Segment) DecRef() error { |
||||
return nil |
||||
} |
||||
|
||||
// Fields returns the field names used in this segment
|
||||
func (s *Segment) Fields() []string { |
||||
return s.FieldsInv |
||||
} |
||||
|
||||
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { |
||||
// ensure document number exists
|
||||
if int(num) > len(s.Stored)-1 { |
||||
return nil |
||||
} |
||||
docFields := s.Stored[int(num)] |
||||
st := s.StoredTypes[int(num)] |
||||
sp := s.StoredPos[int(num)] |
||||
for field, values := range docFields { |
||||
for i, value := range values { |
||||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) |
||||
if !keepGoing { |
||||
return nil |
||||
} |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
func (s *Segment) getField(name string) (int, error) { |
||||
fieldID, ok := s.FieldsMap[name] |
||||
if !ok { |
||||
return 0, fmt.Errorf("no field named %s", name) |
||||
} |
||||
return int(fieldID - 1), nil |
||||
} |
||||
|
||||
// Dictionary returns the term dictionary for the specified field
|
||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { |
||||
fieldID, err := s.getField(field) |
||||
if err != nil { |
||||
// no such field, return empty dictionary
|
||||
return &segment.EmptyDictionary{}, nil |
||||
} |
||||
return &Dictionary{ |
||||
segment: s, |
||||
field: field, |
||||
fieldID: uint16(fieldID), |
||||
}, nil |
||||
} |
||||
|
||||
// Count returns the number of documents in this segment
|
||||
// (this has no notion of deleted docs)
|
||||
func (s *Segment) Count() uint64 { |
||||
return uint64(len(s.Stored)) |
||||
} |
||||
|
||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
||||
// provided _id strings
|
||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { |
||||
rv := roaring.New() |
||||
|
||||
// guard against empty segment
|
||||
if len(s.FieldsMap) > 0 { |
||||
idDictionary := s.Dicts[idFieldID] |
||||
|
||||
for _, id := range ids { |
||||
postingID := idDictionary[id] |
||||
if postingID > 0 { |
||||
rv.Or(s.Postings[postingID-1]) |
||||
} |
||||
} |
||||
} |
||||
return rv, nil |
||||
} |
||||
|
||||
// Close releases all resources associated with this segment
|
||||
func (s *Segment) Close() error { |
||||
return nil |
||||
} |
@ -0,0 +1,75 @@ |
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package segment |
||||
|
||||
import ( |
||||
"regexp/syntax" |
||||
|
||||
"github.com/couchbase/vellum/regexp" |
||||
) |
||||
|
||||
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) { |
||||
// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
|
||||
|
||||
parsed, err := syntax.Parse(pattern, syntax.Perl) |
||||
if err != nil { |
||||
return nil, nil, nil, err |
||||
} |
||||
|
||||
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit) |
||||
if err != nil { |
||||
return nil, nil, nil, err |
||||
} |
||||
|
||||
prefix := LiteralPrefix(parsed) |
||||
if prefix != "" { |
||||
prefixBeg := []byte(prefix) |
||||
prefixEnd := IncrementBytes(prefixBeg) |
||||
return re, prefixBeg, prefixEnd, nil |
||||
} |
||||
|
||||
return re, nil, nil, nil |
||||
} |
||||
|
||||
// LiteralPrefix returns the literal prefix given the parse tree for a regexp
|
||||
func LiteralPrefix(s *syntax.Regexp) string { |
||||
// traverse the left-most branch in the parse tree as long as the
|
||||
// node represents a concatenation
|
||||
for s != nil && s.Op == syntax.OpConcat { |
||||
if len(s.Sub) < 1 { |
||||
return "" |
||||
} |
||||
|
||||
s = s.Sub[0] |
||||
} |
||||
|
||||
if s.Op == syntax.OpLiteral { |
||||
return string(s.Rune) |
||||
} |
||||
|
||||
return "" // no literal prefix
|
||||
} |
||||
|
||||
func IncrementBytes(in []byte) []byte { |
||||
rv := make([]byte, len(in)) |
||||
copy(rv, in) |
||||
for i := len(rv) - 1; i >= 0; i-- { |
||||
rv[i] = rv[i] + 1 |
||||
if rv[i] != 0 { |
||||
return rv // didn't overflow, so stop
|
||||
} |
||||
} |
||||
return nil // overflowed
|
||||
} |
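Taken together, ParseRegexp, LiteralPrefix, and IncrementBytes turn a pattern's leading literal into a half-open byte range that a term-dictionary scan can be restricted to. Below is a minimal sketch of calling the exported helper, assuming the package is importable as github.com/blevesearch/bleve/index/scorch/segment (the path used elsewhere in this diff); it is an illustration, not part of the vendored code.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/index/scorch/segment"
)

func main() {
	// "foo.*bar" has the literal prefix "foo", so a dictionary scan can be
	// restricted to keys in the half-open range ["foo", "fop").
	_, beg, end, err := segment.ParseRegexp("foo.*bar")
	if err != nil {
		panic(err)
	}
	fmt.Printf("scan range: [%q, %q)\n", beg, end) // scan range: ["foo", "fop")
}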
@ -0,0 +1,826 @@ |
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap |
||||
|
||||
import ( |
||||
"bytes" |
||||
"encoding/binary" |
||||
"math" |
||||
"sort" |
||||
"sync" |
||||
|
||||
"github.com/RoaringBitmap/roaring" |
||||
"github.com/blevesearch/bleve/analysis" |
||||
"github.com/blevesearch/bleve/document" |
||||
"github.com/blevesearch/bleve/index" |
||||
"github.com/couchbase/vellum" |
||||
"github.com/golang/snappy" |
||||
) |
||||
|
||||
var NewSegmentBufferNumResultsBump int = 100 |
||||
var NewSegmentBufferNumResultsFactor float64 = 1.0 |
||||
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 |
||||
|
||||
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
|
||||
// SegmentBase from analysis results
|
||||
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, |
||||
chunkFactor uint32) (*SegmentBase, uint64, error) { |
||||
s := interimPool.Get().(*interim) |
||||
|
||||
var br bytes.Buffer |
||||
if s.lastNumDocs > 0 { |
||||
// use previous results to initialize the buf with an estimated
|
||||
// size, but note that the interim instance comes from a
|
||||
// global interimPool, so multiple scorch instances indexing
|
||||
// different docs can lead to low quality estimates
|
||||
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * |
||||
NewSegmentBufferAvgBytesPerDocFactor) |
||||
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * |
||||
NewSegmentBufferNumResultsFactor) |
||||
br.Grow(estimateAvgBytesPerDoc * estimateNumResults) |
||||
} |
||||
|
||||
s.results = results |
||||
s.chunkFactor = chunkFactor |
||||
s.w = NewCountHashWriter(&br) |
||||
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, |
||||
err := s.convert() |
||||
if err != nil { |
||||
return nil, uint64(0), err |
||||
} |
||||
|
||||
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, |
||||
s.FieldsMap, s.FieldsInv, uint64(len(results)), |
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) |
||||
|
||||
if err == nil && s.reset() == nil { |
||||
s.lastNumDocs = len(results) |
||||
s.lastOutSize = len(br.Bytes()) |
||||
interimPool.Put(s) |
||||
} |
||||
|
||||
return sb, uint64(len(br.Bytes())), err |
||||
} |
||||
|
||||
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} |
||||
|
||||
// interim holds temporary working data used while converting from
|
||||
// analysis results to a zap-encoded segment
|
||||
type interim struct { |
||||
results []*index.AnalysisResult |
||||
|
||||
chunkFactor uint32 |
||||
|
||||
w *CountHashWriter |
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16 |
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string |
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64 |
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string |
||||
|
||||
// Fields whose IncludeDocValues is true
|
||||
// field id -> bool
|
||||
IncludeDocValues []bool |
||||
|
||||
// postings id -> bitmap of docNums
|
||||
Postings []*roaring.Bitmap |
||||
|
||||
// postings id -> freq/norm's, one for each docNum in postings
|
||||
FreqNorms [][]interimFreqNorm |
||||
freqNormsBacking []interimFreqNorm |
||||
|
||||
// postings id -> locs, one for each freq
|
||||
Locs [][]interimLoc |
||||
locsBacking []interimLoc |
||||
|
||||
numTermsPerPostingsList []int // key is postings list id
|
||||
numLocsPerPostingsList []int // key is postings list id
|
||||
|
||||
builder *vellum.Builder |
||||
builderBuf bytes.Buffer |
||||
|
||||
metaBuf bytes.Buffer |
||||
|
||||
tmp0 []byte |
||||
tmp1 []byte |
||||
|
||||
lastNumDocs int |
||||
lastOutSize int |
||||
} |
||||
|
||||
func (s *interim) reset() (err error) { |
||||
s.results = nil |
||||
s.chunkFactor = 0 |
||||
s.w = nil |
||||
s.FieldsMap = nil |
||||
s.FieldsInv = nil |
||||
for i := range s.Dicts { |
||||
s.Dicts[i] = nil |
||||
} |
||||
s.Dicts = s.Dicts[:0] |
||||
for i := range s.DictKeys { |
||||
s.DictKeys[i] = s.DictKeys[i][:0] |
||||
} |
||||
s.DictKeys = s.DictKeys[:0] |
||||
for i := range s.IncludeDocValues { |
||||
s.IncludeDocValues[i] = false |
||||
} |
||||
s.IncludeDocValues = s.IncludeDocValues[:0] |
||||
for _, idn := range s.Postings { |
||||
idn.Clear() |
||||
} |
||||
s.Postings = s.Postings[:0] |
||||
s.FreqNorms = s.FreqNorms[:0] |
||||
for i := range s.freqNormsBacking { |
||||
s.freqNormsBacking[i] = interimFreqNorm{} |
||||
} |
||||
s.freqNormsBacking = s.freqNormsBacking[:0] |
||||
s.Locs = s.Locs[:0] |
||||
for i := range s.locsBacking { |
||||
s.locsBacking[i] = interimLoc{} |
||||
} |
||||
s.locsBacking = s.locsBacking[:0] |
||||
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] |
||||
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] |
||||
s.builderBuf.Reset() |
||||
if s.builder != nil { |
||||
err = s.builder.Reset(&s.builderBuf) |
||||
} |
||||
s.metaBuf.Reset() |
||||
s.tmp0 = s.tmp0[:0] |
||||
s.tmp1 = s.tmp1[:0] |
||||
s.lastNumDocs = 0 |
||||
s.lastOutSize = 0 |
||||
|
||||
return err |
||||
} |
||||
|
||||
func (s *interim) grabBuf(size int) []byte { |
||||
buf := s.tmp0 |
||||
if cap(buf) < size { |
||||
buf = make([]byte, size) |
||||
s.tmp0 = buf |
||||
} |
||||
return buf[0:size] |
||||
} |
||||
|
||||
type interimStoredField struct { |
||||
vals [][]byte |
||||
typs []byte |
||||
arrayposs [][]uint64 // array positions
|
||||
} |
||||
|
||||
type interimFreqNorm struct { |
||||
freq uint64 |
||||
norm float32 |
||||
numLocs int |
||||
} |
||||
|
||||
type interimLoc struct { |
||||
fieldID uint16 |
||||
pos uint64 |
||||
start uint64 |
||||
end uint64 |
||||
arrayposs []uint64 |
||||
} |
||||
|
||||
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { |
||||
s.FieldsMap = map[string]uint16{} |
||||
|
||||
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||
|
||||
for _, result := range s.results { |
||||
for _, field := range result.Document.CompositeFields { |
||||
s.getOrDefineField(field.Name()) |
||||
} |
||||
for _, field := range result.Document.Fields { |
||||
s.getOrDefineField(field.Name()) |
||||
} |
||||
} |
||||
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
|
||||
for fieldID, fieldName := range s.FieldsInv { |
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1) |
||||
} |
||||
|
||||
if cap(s.IncludeDocValues) >= len(s.FieldsInv) { |
||||
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] |
||||
} else { |
||||
s.IncludeDocValues = make([]bool, len(s.FieldsInv)) |
||||
} |
||||
|
||||
s.prepareDicts() |
||||
|
||||
for _, dict := range s.DictKeys { |
||||
sort.Strings(dict) |
||||
} |
||||
|
||||
s.processDocuments() |
||||
|
||||
storedIndexOffset, err := s.writeStoredFields() |
||||
if err != nil { |
||||
return 0, 0, 0, nil, err |
||||
} |
||||
|
||||
var fdvIndexOffset uint64 |
||||
var dictOffsets []uint64 |
||||
|
||||
if len(s.results) > 0 { |
||||
fdvIndexOffset, dictOffsets, err = s.writeDicts() |
||||
if err != nil { |
||||
return 0, 0, 0, nil, err |
||||
} |
||||
} else { |
||||
dictOffsets = make([]uint64, len(s.FieldsInv)) |
||||
} |
||||
|
||||
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) |
||||
if err != nil { |
||||
return 0, 0, 0, nil, err |
||||
} |
||||
|
||||
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil |
||||
} |
||||
|
||||
func (s *interim) getOrDefineField(fieldName string) int { |
||||
fieldIDPlus1, exists := s.FieldsMap[fieldName] |
||||
if !exists { |
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) |
||||
s.FieldsMap[fieldName] = fieldIDPlus1 |
||||
s.FieldsInv = append(s.FieldsInv, fieldName) |
||||
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64)) |
||||
|
||||
n := len(s.DictKeys) |
||||
if n < cap(s.DictKeys) { |
||||
s.DictKeys = s.DictKeys[:n+1] |
||||
s.DictKeys[n] = s.DictKeys[n][:0] |
||||
} else { |
||||
s.DictKeys = append(s.DictKeys, []string(nil)) |
||||
} |
||||
} |
||||
|
||||
return int(fieldIDPlus1 - 1) |
||||
} |
||||
|
||||
// fill Dicts and DictKeys from analysis results
|
||||
func (s *interim) prepareDicts() { |
||||
var pidNext int |
||||
|
||||
var totTFs int |
||||
var totLocs int |
||||
|
||||
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { |
||||
dict := s.Dicts[fieldID] |
||||
dictKeys := s.DictKeys[fieldID] |
||||
|
||||
for term, tf := range tfs { |
||||
pidPlus1, exists := dict[term] |
||||
if !exists { |
||||
pidNext++ |
||||
pidPlus1 = uint64(pidNext) |
||||
|
||||
dict[term] = pidPlus1 |
||||
dictKeys = append(dictKeys, term) |
||||
|
||||
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) |
||||
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) |
||||
} |
||||
|
||||
pid := pidPlus1 - 1 |
||||
|
||||
s.numTermsPerPostingsList[pid] += 1 |
||||
s.numLocsPerPostingsList[pid] += len(tf.Locations) |
||||
|
||||
totLocs += len(tf.Locations) |
||||
} |
||||
|
||||
totTFs += len(tfs) |
||||
|
||||
s.DictKeys[fieldID] = dictKeys |
||||
} |
||||
|
||||
for _, result := range s.results { |
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields { |
||||
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||
_, tf := field.Analyze() |
||||
visitField(fieldID, tf) |
||||
} |
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields { |
||||
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||
tf := result.Analyzed[i] |
||||
visitField(fieldID, tf) |
||||
} |
||||
} |
||||
|
||||
numPostingsLists := pidNext |
||||
|
||||
if cap(s.Postings) >= numPostingsLists { |
||||
s.Postings = s.Postings[:numPostingsLists] |
||||
} else { |
||||
postings := make([]*roaring.Bitmap, numPostingsLists) |
||||
copy(postings, s.Postings[:cap(s.Postings)]) |
||||
for i := 0; i < numPostingsLists; i++ { |
||||
if postings[i] == nil { |
||||
postings[i] = roaring.New() |
||||
} |
||||
} |
||||
s.Postings = postings |
||||
} |
||||
|
||||
if cap(s.FreqNorms) >= numPostingsLists { |
||||
s.FreqNorms = s.FreqNorms[:numPostingsLists] |
||||
} else { |
||||
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) |
||||
} |
||||
|
||||
if cap(s.freqNormsBacking) >= totTFs { |
||||
s.freqNormsBacking = s.freqNormsBacking[:totTFs] |
||||
} else { |
||||
s.freqNormsBacking = make([]interimFreqNorm, totTFs) |
||||
} |
||||
|
||||
freqNormsBacking := s.freqNormsBacking |
||||
for pid, numTerms := range s.numTermsPerPostingsList { |
||||
s.FreqNorms[pid] = freqNormsBacking[0:0] |
||||
freqNormsBacking = freqNormsBacking[numTerms:] |
||||
} |
||||
|
||||
if cap(s.Locs) >= numPostingsLists { |
||||
s.Locs = s.Locs[:numPostingsLists] |
||||
} else { |
||||
s.Locs = make([][]interimLoc, numPostingsLists) |
||||
} |
||||
|
||||
if cap(s.locsBacking) >= totLocs { |
||||
s.locsBacking = s.locsBacking[:totLocs] |
||||
} else { |
||||
s.locsBacking = make([]interimLoc, totLocs) |
||||
} |
||||
|
||||
locsBacking := s.locsBacking |
||||
for pid, numLocs := range s.numLocsPerPostingsList { |
||||
s.Locs[pid] = locsBacking[0:0] |
||||
locsBacking = locsBacking[numLocs:] |
||||
} |
||||
} |
||||
|
||||
func (s *interim) processDocuments() { |
||||
numFields := len(s.FieldsInv) |
||||
reuseFieldLens := make([]int, numFields) |
||||
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) |
||||
|
||||
for docNum, result := range s.results { |
||||
for i := 0; i < numFields; i++ { // clear these for reuse
|
||||
reuseFieldLens[i] = 0 |
||||
reuseFieldTFs[i] = nil |
||||
} |
||||
|
||||
s.processDocument(uint64(docNum), result, |
||||
reuseFieldLens, reuseFieldTFs) |
||||
} |
||||
} |
||||
|
||||
func (s *interim) processDocument(docNum uint64, |
||||
result *index.AnalysisResult, |
||||
fieldLens []int, fieldTFs []analysis.TokenFrequencies) { |
||||
visitField := func(fieldID uint16, fieldName string, |
||||
ln int, tf analysis.TokenFrequencies) { |
||||
fieldLens[fieldID] += ln |
||||
|
||||
existingFreqs := fieldTFs[fieldID] |
||||
if existingFreqs != nil { |
||||
existingFreqs.MergeAll(fieldName, tf) |
||||
} else { |
||||
fieldTFs[fieldID] = tf |
||||
} |
||||
} |
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields { |
||||
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||
ln, tf := field.Analyze() |
||||
visitField(fieldID, field.Name(), ln, tf) |
||||
} |
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields { |
||||
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||
ln := result.Length[i] |
||||
tf := result.Analyzed[i] |
||||
visitField(fieldID, field.Name(), ln, tf) |
||||
} |
||||
|
||||
// now that it's been rolled up into fieldTFs, walk that
|
||||
for fieldID, tfs := range fieldTFs { |
||||
dict := s.Dicts[fieldID] |
||||
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) |
||||
|
||||
for term, tf := range tfs { |
||||
pid := dict[term] - 1 |
||||
bs := s.Postings[pid] |
||||
bs.Add(uint32(docNum)) |
||||
|
||||
s.FreqNorms[pid] = append(s.FreqNorms[pid], |
||||
interimFreqNorm{ |
||||
freq: uint64(tf.Frequency()), |
||||
norm: norm, |
||||
numLocs: len(tf.Locations), |
||||
}) |
||||
|
||||
if len(tf.Locations) > 0 { |
||||
locs := s.Locs[pid] |
||||
|
||||
for _, loc := range tf.Locations { |
||||
var locf = uint16(fieldID) |
||||
if loc.Field != "" { |
||||
locf = uint16(s.getOrDefineField(loc.Field)) |
||||
} |
||||
var arrayposs []uint64 |
||||
if len(loc.ArrayPositions) > 0 { |
||||
arrayposs = loc.ArrayPositions |
||||
} |
||||
locs = append(locs, interimLoc{ |
||||
fieldID: locf, |
||||
pos: uint64(loc.Position), |
||||
start: uint64(loc.Start), |
||||
end: uint64(loc.End), |
||||
arrayposs: arrayposs, |
||||
}) |
||||
} |
||||
|
||||
s.Locs[pid] = locs |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
func (s *interim) writeStoredFields() ( |
||||
storedIndexOffset uint64, err error) { |
||||
varBuf := make([]byte, binary.MaxVarintLen64) |
||||
metaEncode := func(val uint64) (int, error) { |
||||
wb := binary.PutUvarint(varBuf, val) |
||||
return s.metaBuf.Write(varBuf[:wb]) |
||||
} |
||||
|
||||
data, compressed := s.tmp0[:0], s.tmp1[:0] |
||||
defer func() { s.tmp0, s.tmp1 = data, compressed }() |
||||
|
||||
// keyed by docNum
|
||||
docStoredOffsets := make([]uint64, len(s.results)) |
||||
|
||||
// keyed by fieldID, for the current doc in the loop
|
||||
docStoredFields := map[uint16]interimStoredField{} |
||||
|
||||
for docNum, result := range s.results { |
||||
for fieldID := range docStoredFields { // reset for next doc
|
||||
delete(docStoredFields, fieldID) |
||||
} |
||||
|
||||
for _, field := range result.Document.Fields { |
||||
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||
|
||||
opts := field.Options() |
||||
|
||||
if opts.IsStored() { |
||||
isf := docStoredFields[fieldID] |
||||
isf.vals = append(isf.vals, field.Value()) |
||||
isf.typs = append(isf.typs, encodeFieldType(field)) |
||||
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) |
||||
docStoredFields[fieldID] = isf |
||||
} |
||||
|
||||
if opts.IncludeDocValues() { |
||||
s.IncludeDocValues[fieldID] = true |
||||
} |
||||
} |
||||
|
||||
var curr int |
||||
|
||||
s.metaBuf.Reset() |
||||
data = data[:0] |
||||
|
||||
// _id field special case optimizes ExternalID() lookups
|
||||
idFieldVal := docStoredFields[uint16(0)].vals[0] |
||||
_, err = metaEncode(uint64(len(idFieldVal))) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
|
||||
// handle non-"_id" fields
|
||||
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { |
||||
isf, exists := docStoredFields[uint16(fieldID)] |
||||
if exists { |
||||
curr, data, err = persistStoredFieldValues( |
||||
fieldID, isf.vals, isf.typs, isf.arrayposs, |
||||
curr, metaEncode, data) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
} |
||||
} |
||||
|
||||
metaBytes := s.metaBuf.Bytes() |
||||
|
||||
compressed = snappy.Encode(compressed[:cap(compressed)], data) |
||||
|
||||
docStoredOffsets[docNum] = uint64(s.w.Count()) |
||||
|
||||
_, err := writeUvarints(s.w, |
||||
uint64(len(metaBytes)), |
||||
uint64(len(idFieldVal)+len(compressed))) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
|
||||
_, err = s.w.Write(metaBytes) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
|
||||
_, err = s.w.Write(idFieldVal) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
|
||||
_, err = s.w.Write(compressed) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
} |
||||
|
||||
storedIndexOffset = uint64(s.w.Count()) |
||||
|
||||
for _, docStoredOffset := range docStoredOffsets { |
||||
err = binary.Write(s.w, binary.BigEndian, docStoredOffset) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
} |
||||
|
||||
return storedIndexOffset, nil |
||||
} |
||||
|
||||
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { |
||||
dictOffsets = make([]uint64, len(s.FieldsInv)) |
||||
|
||||
fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) |
||||
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) |
||||
|
||||
buf := s.grabBuf(binary.MaxVarintLen64) |
||||
|
||||
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) |
||||
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) |
||||
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false) |
||||
|
||||
var docTermMap [][]byte |
||||
|
||||
if s.builder == nil { |
||||
s.builder, err = vellum.New(&s.builderBuf, nil) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
} |
||||
|
||||
for fieldID, terms := range s.DictKeys { |
||||
if cap(docTermMap) < len(s.results) { |
||||
docTermMap = make([][]byte, len(s.results)) |
||||
} else { |
||||
docTermMap = docTermMap[0:len(s.results)] |
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0] |
||||
} |
||||
} |
||||
|
||||
dict := s.Dicts[fieldID] |
||||
|
||||
for _, term := range terms { // terms are already sorted
|
||||
pid := dict[term] - 1 |
||||
|
||||
postingsBS := s.Postings[pid] |
||||
|
||||
freqNorms := s.FreqNorms[pid] |
||||
freqNormOffset := 0 |
||||
|
||||
locs := s.Locs[pid] |
||||
locOffset := 0 |
||||
|
||||
postingsItr := postingsBS.Iterator() |
||||
for postingsItr.HasNext() { |
||||
docNum := uint64(postingsItr.Next()) |
||||
|
||||
freqNorm := freqNorms[freqNormOffset] |
||||
|
||||
err = tfEncoder.Add(docNum, |
||||
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), |
||||
uint64(math.Float32bits(freqNorm.norm))) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
if freqNorm.numLocs > 0 { |
||||
numBytesLocs := 0 |
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { |
||||
numBytesLocs += totalUvarintBytes( |
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end, |
||||
uint64(len(loc.arrayposs)), loc.arrayposs) |
||||
} |
||||
|
||||
err = locEncoder.Add(docNum, uint64(numBytesLocs)) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { |
||||
err = locEncoder.Add(docNum, |
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end, |
||||
uint64(len(loc.arrayposs))) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
err = locEncoder.Add(docNum, loc.arrayposs...) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
} |
||||
|
||||
locOffset += freqNorm.numLocs |
||||
} |
||||
|
||||
freqNormOffset++ |
||||
|
||||
docTermMap[docNum] = append( |
||||
append(docTermMap[docNum], term...), |
||||
termSeparator) |
||||
} |
||||
|
||||
tfEncoder.Close() |
||||
locEncoder.Close() |
||||
|
||||
postingsOffset, err := |
||||
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
if postingsOffset > uint64(0) { |
||||
err = s.builder.Insert([]byte(term), postingsOffset) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
} |
||||
|
||||
tfEncoder.Reset() |
||||
locEncoder.Reset() |
||||
} |
||||
|
||||
err = s.builder.Close() |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
// record where this dictionary starts
|
||||
dictOffsets[fieldID] = uint64(s.w.Count()) |
||||
|
||||
vellumData := s.builderBuf.Bytes() |
||||
|
||||
// write out the length of the vellum data
|
||||
n := binary.PutUvarint(buf, uint64(len(vellumData))) |
||||
_, err = s.w.Write(buf[:n]) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
// write this vellum to disk
|
||||
_, err = s.w.Write(vellumData) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
// reset vellum for reuse
|
||||
s.builderBuf.Reset() |
||||
|
||||
err = s.builder.Reset(&s.builderBuf) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
// write the field doc values
|
||||
if s.IncludeDocValues[fieldID] { |
||||
for docNum, docTerms := range docTermMap { |
||||
if len(docTerms) > 0 { |
||||
err = fdvEncoder.Add(uint64(docNum), docTerms) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
} |
||||
} |
||||
err = fdvEncoder.Close() |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
fdvOffsetsStart[fieldID] = uint64(s.w.Count()) |
||||
|
||||
_, err = fdvEncoder.Write() |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
|
||||
fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) |
||||
|
||||
fdvEncoder.Reset() |
||||
} else { |
||||
fdvOffsetsStart[fieldID] = fieldNotUninverted |
||||
fdvOffsetsEnd[fieldID] = fieldNotUninverted |
||||
} |
||||
} |
||||
|
||||
fdvIndexOffset = uint64(s.w.Count()) |
||||
|
||||
for i := 0; i < len(fdvOffsetsStart); i++ { |
||||
n := binary.PutUvarint(buf, fdvOffsetsStart[i]) |
||||
_, err := s.w.Write(buf[:n]) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) |
||||
_, err = s.w.Write(buf[:n]) |
||||
if err != nil { |
||||
return 0, nil, err |
||||
} |
||||
} |
||||
|
||||
return fdvIndexOffset, dictOffsets, nil |
||||
} |
||||
|
||||
func encodeFieldType(f document.Field) byte { |
||||
fieldType := byte('x') |
||||
switch f.(type) { |
||||
case *document.TextField: |
||||
fieldType = 't' |
||||
case *document.NumericField: |
||||
fieldType = 'n' |
||||
case *document.DateTimeField: |
||||
fieldType = 'd' |
||||
case *document.BooleanField: |
||||
fieldType = 'b' |
||||
case *document.GeoPointField: |
||||
fieldType = 'g' |
||||
case *document.CompositeField: |
||||
fieldType = 'c' |
||||
} |
||||
return fieldType |
||||
} |
||||
|
||||
// totalUvarintBytes returns the total number of bytes needed to encode the given uint64s
|
||||
// using binary.PutUvarint() encoding
|
||||
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { |
||||
n = numUvarintBytes(a) |
||||
n += numUvarintBytes(b) |
||||
n += numUvarintBytes(c) |
||||
n += numUvarintBytes(d) |
||||
n += numUvarintBytes(e) |
||||
for _, v := range more { |
||||
n += numUvarintBytes(v) |
||||
} |
||||
return n |
||||
} |
||||
|
||||
// numUvarintBytes returns the number of bytes needed to encode x using binary.PutUvarint() encoding
|
||||
func numUvarintBytes(x uint64) (n int) { |
||||
for x >= 0x80 { |
||||
x >>= 7 |
||||
n++ |
||||
} |
||||
return n + 1 |
||||
} |
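numUvarintBytes and totalUvarintBytes predict exactly how many bytes binary.PutUvarint will emit (one byte per 7 bits of payload), which is how the per-location byte counts fed to locEncoder above are computed without a second encoding pass. A standalone sketch that cross-checks the same length logic against the standard library; the helper is reimplemented locally here since it is unexported in package zap.

package main

import (
	"encoding/binary"
	"fmt"
)

// numUvarintBytes mirrors the unexported zap helper: one byte per 7 payload bits.
func numUvarintBytes(x uint64) (n int) {
	for x >= 0x80 {
		x >>= 7
		n++
	}
	return n + 1
}

func main() {
	buf := make([]byte, binary.MaxVarintLen64)
	for _, v := range []uint64{0, 1, 127, 128, 16383, 16384, 1 << 63} {
		written := binary.PutUvarint(buf, v)
		fmt.Printf("v=%d predicted=%d actual=%d\n", v, numUvarintBytes(v), written)
	}
}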
File diff suppressed because it is too large
343 vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go (generated, vendored)
@ -0,0 +1,343 @@ |
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package searcher |
||||
|
||||
import ( |
||||
"bytes" |
||||
"container/heap" |
||||
"math" |
||||
"reflect" |
||||
|
||||
"github.com/blevesearch/bleve/index" |
||||
"github.com/blevesearch/bleve/search" |
||||
"github.com/blevesearch/bleve/search/scorer" |
||||
"github.com/blevesearch/bleve/size" |
||||
) |
||||
|
||||
var reflectStaticSizeDisjunctionHeapSearcher int |
||||
var reflectStaticSizeSearcherCurr int |
||||
|
||||
func init() { |
||||
var dhs DisjunctionHeapSearcher |
||||
reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size()) |
||||
|
||||
var sc SearcherCurr |
||||
reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size()) |
||||
} |
||||
|
||||
type SearcherCurr struct { |
||||
searcher search.Searcher |
||||
curr *search.DocumentMatch |
||||
} |
||||
|
||||
type DisjunctionHeapSearcher struct { |
||||
indexReader index.IndexReader |
||||
|
||||
numSearchers int |
||||
scorer *scorer.DisjunctionQueryScorer |
||||
min int |
||||
queryNorm float64 |
||||
initialized bool |
||||
searchers []search.Searcher |
||||
heap []*SearcherCurr |
||||
|
||||
matching []*search.DocumentMatch |
||||
matchingCurrs []*SearcherCurr |
||||
} |
||||
|
||||
func newDisjunctionHeapSearcher(indexReader index.IndexReader, |
||||
searchers []search.Searcher, min float64, options search.SearcherOptions, |
||||
limit bool) ( |
||||
*DisjunctionHeapSearcher, error) { |
||||
if limit && tooManyClauses(len(searchers)) { |
||||
return nil, tooManyClausesErr(len(searchers)) |
||||
} |
||||
|
||||
// build our searcher
|
||||
rv := DisjunctionHeapSearcher{ |
||||
indexReader: indexReader, |
||||
searchers: searchers, |
||||
numSearchers: len(searchers), |
||||
scorer: scorer.NewDisjunctionQueryScorer(options), |
||||
min: int(min), |
||||
matching: make([]*search.DocumentMatch, len(searchers)), |
||||
matchingCurrs: make([]*SearcherCurr, len(searchers)), |
||||
heap: make([]*SearcherCurr, 0, len(searchers)), |
||||
} |
||||
rv.computeQueryNorm() |
||||
return &rv, nil |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Size() int { |
||||
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + |
||||
s.scorer.Size() |
||||
|
||||
for _, entry := range s.searchers { |
||||
sizeInBytes += entry.Size() |
||||
} |
||||
|
||||
for _, entry := range s.matching { |
||||
if entry != nil { |
||||
sizeInBytes += entry.Size() |
||||
} |
||||
} |
||||
|
||||
// for matchingCurrs and heap, just use static size * len
|
||||
// since searchers and document matches already counted above
|
||||
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr |
||||
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr |
||||
|
||||
return sizeInBytes |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) computeQueryNorm() { |
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0 |
||||
for _, searcher := range s.searchers { |
||||
sumOfSquaredWeights += searcher.Weight() |
||||
} |
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) |
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers { |
||||
searcher.SetQueryNorm(s.queryNorm) |
||||
} |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error { |
||||
// alloc a single block of SearcherCurrs
|
||||
block := make([]SearcherCurr, len(s.searchers)) |
||||
|
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers { |
||||
curr, err := searcher.Next(ctx) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
if curr != nil { |
||||
block[i].searcher = searcher |
||||
block[i].curr = curr |
||||
heap.Push(s, &block[i]) |
||||
} |
||||
} |
||||
|
||||
err := s.updateMatches() |
||||
if err != nil { |
||||
return err |
||||
} |
||||
s.initialized = true |
||||
return nil |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) updateMatches() error { |
||||
matching := s.matching[:0] |
||||
matchingCurrs := s.matchingCurrs[:0] |
||||
|
||||
if len(s.heap) > 0 { |
||||
|
||||
// top of the heap is our next hit
|
||||
next := heap.Pop(s).(*SearcherCurr) |
||||
matching = append(matching, next.curr) |
||||
matchingCurrs = append(matchingCurrs, next) |
||||
|
||||
// now as long as top of heap matches, keep popping
|
||||
for len(s.heap) > 0 && bytes.Equal(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) { |
||||
next = heap.Pop(s).(*SearcherCurr) |
||||
matching = append(matching, next.curr) |
||||
matchingCurrs = append(matchingCurrs, next) |
||||
} |
||||
} |
||||
|
||||
s.matching = matching |
||||
s.matchingCurrs = matchingCurrs |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Weight() float64 { |
||||
var rv float64 |
||||
for _, searcher := range s.searchers { |
||||
rv += searcher.Weight() |
||||
} |
||||
return rv |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) { |
||||
for _, searcher := range s.searchers { |
||||
searcher.SetQueryNorm(qnorm) |
||||
} |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( |
||||
*search.DocumentMatch, error) { |
||||
if !s.initialized { |
||||
err := s.initSearchers(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
var rv *search.DocumentMatch |
||||
found := false |
||||
for !found && len(s.matching) > 0 { |
||||
if len(s.matching) >= s.min { |
||||
found = true |
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) |
||||
} |
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, matchingCurr := range s.matchingCurrs { |
||||
if matchingCurr.curr != rv { |
||||
ctx.DocumentMatchPool.Put(matchingCurr.curr) |
||||
} |
||||
curr, err := matchingCurr.searcher.Next(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
if curr != nil { |
||||
matchingCurr.curr = curr |
||||
heap.Push(s, matchingCurr) |
||||
} |
||||
} |
||||
|
||||
err := s.updateMatches() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
return rv, nil |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, |
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) { |
||||
if !s.initialized { |
||||
err := s.initSearchers(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
// if there is anything in matching, toss it back onto the heap
|
||||
for _, matchingCurr := range s.matchingCurrs { |
||||
heap.Push(s, matchingCurr) |
||||
} |
||||
s.matching = s.matching[:0] |
||||
s.matchingCurrs = s.matchingCurrs[:0] |
||||
|
||||
// find all searchers that actually need to be advanced
|
||||
// advance them, using s.matchingCurrs as temp storage
|
||||
for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { |
||||
searcherCurr := heap.Pop(s).(*SearcherCurr) |
||||
ctx.DocumentMatchPool.Put(searcherCurr.curr) |
||||
curr, err := searcherCurr.searcher.Advance(ctx, ID) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
if curr != nil { |
||||
searcherCurr.curr = curr |
||||
s.matchingCurrs = append(s.matchingCurrs, searcherCurr) |
||||
} |
||||
} |
||||
// now all of the searchers that we advanced have to be pushed back
|
||||
for _, matchingCurr := range s.matchingCurrs { |
||||
heap.Push(s, matchingCurr) |
||||
} |
||||
// reset our temp space
|
||||
s.matchingCurrs = s.matchingCurrs[:0] |
||||
|
||||
err := s.updateMatches() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return s.Next(ctx) |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Count() uint64 { |
||||
// for now return a worst case
|
||||
var sum uint64 |
||||
for _, searcher := range s.searchers { |
||||
sum += searcher.Count() |
||||
} |
||||
return sum |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Close() (rv error) { |
||||
for _, searcher := range s.searchers { |
||||
err := searcher.Close() |
||||
if err != nil && rv == nil { |
||||
rv = err |
||||
} |
||||
} |
||||
return rv |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Min() int { |
||||
return s.min |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { |
||||
rv := len(s.searchers) |
||||
for _, s := range s.searchers { |
||||
rv += s.DocumentMatchPoolSize() |
||||
} |
||||
return rv |
||||
} |
||||
|
||||
// a disjunction searcher implements the index.Optimizable interface
|
||||
// but only activates on an edge case where the disjunction is a
|
||||
// wrapper around a single Optimizable child searcher
|
||||
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) ( |
||||
index.OptimizableContext, error) { |
||||
if len(s.searchers) == 1 { |
||||
o, ok := s.searchers[0].(index.Optimizable) |
||||
if ok { |
||||
return o.Optimize(kind, octx) |
||||
} |
||||
} |
||||
|
||||
return octx, nil |
||||
} |
||||
|
||||
// heap impl
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) } |
||||
|
||||
func (s *DisjunctionHeapSearcher) Less(i, j int) bool { |
||||
if s.heap[i].curr == nil { |
||||
return true |
||||
} else if s.heap[j].curr == nil { |
||||
return false |
||||
} |
||||
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Swap(i, j int) { |
||||
s.heap[i], s.heap[j] = s.heap[j], s.heap[i] |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Push(x interface{}) { |
||||
s.heap = append(s.heap, x.(*SearcherCurr)) |
||||
} |
||||
|
||||
func (s *DisjunctionHeapSearcher) Pop() interface{} { |
||||
old := s.heap |
||||
n := len(old) |
||||
x := old[n-1] |
||||
s.heap = old[0 : n-1] |
||||
return x |
||||
} |
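The Len/Less/Swap/Push/Pop methods above make DisjunctionHeapSearcher itself satisfy container/heap.Interface, keyed on each cursor's IndexInternalID, so heap.Push and heap.Pop can be called directly on the searcher. The following is a self-contained sketch of that same pattern with a hypothetical cursor/merger pair, using only the standard library rather than bleve's types.

package main

import (
	"container/heap"
	"fmt"
)

// cursor stands in for a SearcherCurr: something carrying a sort key.
type cursor struct{ id string }

// merger owns a slice of cursors and implements heap.Interface over it,
// the same shape as DisjunctionHeapSearcher's heap field.
type merger struct{ heap []*cursor }

func (m *merger) Len() int           { return len(m.heap) }
func (m *merger) Less(i, j int) bool { return m.heap[i].id < m.heap[j].id }
func (m *merger) Swap(i, j int)      { m.heap[i], m.heap[j] = m.heap[j], m.heap[i] }
func (m *merger) Push(x interface{}) { m.heap = append(m.heap, x.(*cursor)) }
func (m *merger) Pop() interface{} {
	old := m.heap
	n := len(old)
	x := old[n-1]
	m.heap = old[:n-1]
	return x
}

func main() {
	m := &merger{}
	for _, id := range []string{"doc7", "doc2", "doc9"} {
		heap.Push(m, &cursor{id: id})
	}
	for m.Len() > 0 {
		fmt.Println(heap.Pop(m).(*cursor).id) // doc2, doc7, doc9
	}
}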
298 vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go (generated, vendored)
@ -0,0 +1,298 @@ |
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package searcher |
||||
|
||||
import ( |
||||
"math" |
||||
"reflect" |
||||
"sort" |
||||
|
||||
"github.com/blevesearch/bleve/index" |
||||
"github.com/blevesearch/bleve/search" |
||||
"github.com/blevesearch/bleve/search/scorer" |
||||
"github.com/blevesearch/bleve/size" |
||||
) |
||||
|
||||
var reflectStaticSizeDisjunctionSliceSearcher int |
||||
|
||||
func init() { |
||||
var ds DisjunctionSliceSearcher |
||||
reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size()) |
||||
} |
||||
|
||||
type DisjunctionSliceSearcher struct { |
||||
indexReader index.IndexReader |
||||
searchers OrderedSearcherList |
||||
numSearchers int |
||||
queryNorm float64 |
||||
currs []*search.DocumentMatch |
||||
scorer *scorer.DisjunctionQueryScorer |
||||
min int |
||||
matching []*search.DocumentMatch |
||||
matchingIdxs []int |
||||
initialized bool |
||||
} |
||||
|
||||
func newDisjunctionSliceSearcher(indexReader index.IndexReader, |
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions, |
||||
limit bool) ( |
||||
*DisjunctionSliceSearcher, error) { |
||||
if limit && tooManyClauses(len(qsearchers)) { |
||||
return nil, tooManyClausesErr(len(qsearchers)) |
||||
} |
||||
// build the downstream searchers
|
||||
searchers := make(OrderedSearcherList, len(qsearchers)) |
||||
for i, searcher := range qsearchers { |
||||
searchers[i] = searcher |
||||
} |
||||
// sort the searchers
|
||||
sort.Sort(sort.Reverse(searchers)) |
||||
// build our searcher
|
||||
rv := DisjunctionSliceSearcher{ |
||||
indexReader: indexReader, |
||||
searchers: searchers, |
||||
numSearchers: len(searchers), |
||||
currs: make([]*search.DocumentMatch, len(searchers)), |
||||
scorer: scorer.NewDisjunctionQueryScorer(options), |
||||
min: int(min), |
||||
matching: make([]*search.DocumentMatch, len(searchers)), |
||||
matchingIdxs: make([]int, len(searchers)), |
||||
} |
||||
rv.computeQueryNorm() |
||||
return &rv, nil |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Size() int { |
||||
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr + |
||||
s.scorer.Size() |
||||
|
||||
for _, entry := range s.searchers { |
||||
sizeInBytes += entry.Size() |
||||
} |
||||
|
||||
for _, entry := range s.currs { |
||||
if entry != nil { |
||||
sizeInBytes += entry.Size() |
||||
} |
||||
} |
||||
|
||||
for _, entry := range s.matching { |
||||
if entry != nil { |
||||
sizeInBytes += entry.Size() |
||||
} |
||||
} |
||||
|
||||
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt |
||||
|
||||
return sizeInBytes |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) computeQueryNorm() { |
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0 |
||||
for _, searcher := range s.searchers { |
||||
sumOfSquaredWeights += searcher.Weight() |
||||
} |
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) |
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers { |
||||
searcher.SetQueryNorm(s.queryNorm) |
||||
} |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error { |
||||
var err error |
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers { |
||||
if s.currs[i] != nil { |
||||
ctx.DocumentMatchPool.Put(s.currs[i]) |
||||
} |
||||
s.currs[i], err = searcher.Next(ctx) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
} |
||||
|
||||
err = s.updateMatches() |
||||
if err != nil { |
||||
return err |
||||
} |
||||
|
||||
s.initialized = true |
||||
return nil |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) updateMatches() error { |
||||
matching := s.matching[:0] |
||||
matchingIdxs := s.matchingIdxs[:0] |
||||
|
||||
for i := 0; i < len(s.currs); i++ { |
||||
curr := s.currs[i] |
||||
if curr == nil { |
||||
continue |
||||
} |
||||
|
||||
if len(matching) > 0 { |
||||
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID) |
||||
if cmp > 0 { |
||||
continue |
||||
} |
||||
|
||||
if cmp < 0 { |
||||
matching = matching[:0] |
||||
matchingIdxs = matchingIdxs[:0] |
||||
} |
||||
} |
||||
|
||||
matching = append(matching, curr) |
||||
matchingIdxs = append(matchingIdxs, i) |
||||
} |
||||
|
||||
s.matching = matching |
||||
s.matchingIdxs = matchingIdxs |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Weight() float64 { |
||||
var rv float64 |
||||
for _, searcher := range s.searchers { |
||||
rv += searcher.Weight() |
||||
} |
||||
return rv |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) { |
||||
for _, searcher := range s.searchers { |
||||
searcher.SetQueryNorm(qnorm) |
||||
} |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) ( |
||||
*search.DocumentMatch, error) { |
||||
if !s.initialized { |
||||
err := s.initSearchers(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
var err error |
||||
var rv *search.DocumentMatch |
||||
|
||||
found := false |
||||
for !found && len(s.matching) > 0 { |
||||
if len(s.matching) >= s.min { |
||||
found = true |
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) |
||||
} |
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, i := range s.matchingIdxs { |
||||
searcher := s.searchers[i] |
||||
if s.currs[i] != rv { |
||||
ctx.DocumentMatchPool.Put(s.currs[i]) |
||||
} |
||||
s.currs[i], err = searcher.Next(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
err = s.updateMatches() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
return rv, nil |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext, |
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) { |
||||
if !s.initialized { |
||||
err := s.initSearchers(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
// get all searchers pointing at their first match
|
||||
var err error |
||||
for i, searcher := range s.searchers { |
||||
if s.currs[i] != nil { |
||||
if s.currs[i].IndexInternalID.Compare(ID) >= 0 { |
||||
continue |
||||
} |
||||
ctx.DocumentMatchPool.Put(s.currs[i]) |
||||
} |
||||
s.currs[i], err = searcher.Advance(ctx, ID) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
err = s.updateMatches() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return s.Next(ctx) |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Count() uint64 { |
||||
// for now return a worst case
|
||||
var sum uint64 |
||||
for _, searcher := range s.searchers { |
||||
sum += searcher.Count() |
||||
} |
||||
return sum |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Close() (rv error) { |
||||
for _, searcher := range s.searchers { |
||||
err := searcher.Close() |
||||
if err != nil && rv == nil { |
||||
rv = err |
||||
} |
||||
} |
||||
return rv |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) Min() int { |
||||
return s.min |
||||
} |
||||
|
||||
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { |
||||
rv := len(s.currs) |
||||
for _, s := range s.searchers { |
||||
rv += s.DocumentMatchPoolSize() |
||||
} |
||||
return rv |
||||
} |
||||
|
||||
// a disjunction searcher implements the index.Optimizable interface
|
||||
// but only activates on an edge case where the disjunction is a
|
||||
// wrapper around a single Optimizable child searcher
|
||||
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) ( |
||||
index.OptimizableContext, error) { |
||||
if len(s.searchers) == 1 { |
||||
o, ok := s.searchers[0].(index.Optimizable) |
||||
if ok { |
||||
return o.Optimize(kind, octx) |
||||
} |
||||
} |
||||
|
||||
return octx, nil |
||||
} |
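updateMatches in the slice-based searcher is the selection step of a k-way merge: it keeps only the cursors currently positioned at the smallest IndexInternalID. Below is a small sketch of the same selection logic over plain ints, with a hypothetical smallestMatches helper standing in for the method.

package main

import "fmt"

// smallestMatches returns the indexes of the cursors positioned at the
// minimum current value -- the same selection updateMatches performs over
// s.currs, just with ints instead of IndexInternalIDs.
func smallestMatches(currs []int) []int {
	var matching []int
	for i, v := range currs {
		if len(matching) > 0 {
			prev := currs[matching[0]]
			if v > prev {
				continue
			}
			if v < prev {
				matching = matching[:0]
			}
		}
		matching = append(matching, i)
	}
	return matching
}

func main() {
	// three child "searchers" currently positioned at these doc numbers
	fmt.Println(smallestMatches([]int{12, 7, 7})) // [1 2]
}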
Some files were not shown because too many files have changed in this diff.