Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2
* remove unused pkg from dep file
* change bleve from master to recent revision

Branch: tokarchuk/v1.17
parent 11e316654e
commit a380cfd8e0
@@ -1,22 +0,0 @@
The MIT License (MIT)

Copyright (c) 2015 Stephen Merity

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,229 +0,0 @@
package govarint

import "encoding/binary"
import "io"

type U32VarintEncoder interface {
	PutU32(x uint32) int
	Close()
}

type U32VarintDecoder interface {
	GetU32() (uint32, error)
}

///

type U64VarintEncoder interface {
	PutU64(x uint64) int
	Close()
}

type U64VarintDecoder interface {
	GetU64() (uint64, error)
}

///

type U32GroupVarintEncoder struct {
	w     io.Writer
	index int
	store [4]uint32
	temp  [17]byte
}

func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} }

func (b *U32GroupVarintEncoder) Flush() (int, error) {
	// TODO: Is it more efficient to have a tailored version that's called only in Close()?
	// If index is zero, there are no integers to flush
	if b.index == 0 {
		return 0, nil
	}
	// In the case we're flushing (the group isn't of size four), the non-values should be zero
	// This ensures the unused entries are all zero in the sizeByte
	for i := b.index; i < 4; i++ {
		b.store[i] = 0
	}
	length := 1
	// We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it
	b.temp[0] = 0
	for i, x := range b.store {
		size := byte(0)
		shifts := []byte{24, 16, 8, 0}
		for _, shift := range shifts {
			// Always writes at least one byte -- the first one (shift = 0)
			// Will write more bytes until the rest of the integer is all zeroes
			if (x>>shift) != 0 || shift == 0 {
				size += 1
				b.temp[length] = byte(x >> shift)
				length += 1
			}
		}
		// We store the size in two of the eight bits in the first byte (sizeByte)
		// 0 means there is one byte in total, hence why we subtract one from size
		b.temp[0] |= (size - 1) << (uint8(3-i) * 2)
	}
	// If we're flushing without a full group of four, remove the unused bytes we computed
	// This enables us to realize it's a partial group on decoding thanks to EOF
	if b.index != 4 {
		length -= 4 - b.index
	}
	_, err := b.w.Write(b.temp[:length])
	return length, err
}

func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) {
	bytesWritten := 0
	b.store[b.index] = x
	b.index += 1
	if b.index == 4 {
		n, err := b.Flush()
		if err != nil {
			return n, err
		}
		bytesWritten += n
		b.index = 0
	}
	return bytesWritten, nil
}

func (b *U32GroupVarintEncoder) Close() {
	// On Close, we flush any remaining values that might not have been in a full group
	b.Flush()
}

///

type U32GroupVarintDecoder struct {
	r        io.ByteReader
	group    [4]uint32
	pos      int
	finished bool
	capacity int
}

func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder {
	return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4}
}

func (b *U32GroupVarintDecoder) getGroup() error {
	// We should always receive a sizeByte if there are more values to read
	sizeByte, err := b.r.ReadByte()
	if err != nil {
		return err
	}
	// Calculate the size of the four incoming 32 bit integers
	// 0b00 means 1 byte to read, 0b01 = 2, etc
	b.group[0] = uint32((sizeByte >> 6) & 3)
	b.group[1] = uint32((sizeByte >> 4) & 3)
	b.group[2] = uint32((sizeByte >> 2) & 3)
	b.group[3] = uint32(sizeByte & 3)
	//
	for index, size := range b.group {
		b.group[index] = 0
		// Any error that occurs in earlier byte reads should be repeated at the end one
		// Hence we only catch and report the final ReadByte's error
		var err error
		switch size {
		case 0:
			var x byte
			x, err = b.r.ReadByte()
			b.group[index] = uint32(x)
		case 1:
			var x, y byte
			x, _ = b.r.ReadByte()
			y, err = b.r.ReadByte()
			b.group[index] = uint32(x)<<8 | uint32(y)
		case 2:
			var x, y, z byte
			x, _ = b.r.ReadByte()
			y, _ = b.r.ReadByte()
			z, err = b.r.ReadByte()
			b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z)
		case 3:
			var x, y, z, zz byte
			x, _ = b.r.ReadByte()
			y, _ = b.r.ReadByte()
			z, _ = b.r.ReadByte()
			zz, err = b.r.ReadByte()
			b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz)
		}
		if err != nil {
			if err == io.EOF {
				// If we hit EOF here, we have found a partial group
				// We've return any valid entries we have read and return EOF once we run out
				b.capacity = index
				b.finished = true
				break
			} else {
				return err
			}
		}
	}
	// Reset the pos pointer to the beginning of the read values
	b.pos = 0
	return nil
}

func (b *U32GroupVarintDecoder) GetU32() (uint32, error) {
	// Check if we have any more values to give out - if not, let's get them
	if b.pos == b.capacity {
		// If finished is set, there is nothing else to do
		if b.finished {
			return 0, io.EOF
		}
		err := b.getGroup()
		if err != nil {
			return 0, err
		}
	}
	// Increment pointer and return the value stored at that point
	b.pos += 1
	return b.group[b.pos-1], nil
}

///

type Base128Encoder struct {
	w        io.Writer
	tmpBytes []byte
}

func NewU32Base128Encoder(w io.Writer) *Base128Encoder {
	return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)}
}
func NewU64Base128Encoder(w io.Writer) *Base128Encoder {
	return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
}

func (b *Base128Encoder) PutU32(x uint32) (int, error) {
	writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x))
	return b.w.Write(b.tmpBytes[:writtenBytes])
}

func (b *Base128Encoder) PutU64(x uint64) (int, error) {
	writtenBytes := binary.PutUvarint(b.tmpBytes, x)
	return b.w.Write(b.tmpBytes[:writtenBytes])
}

func (b *Base128Encoder) Close() {
}

///

type Base128Decoder struct {
	r io.ByteReader
}

func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }

func (b *Base128Decoder) GetU32() (uint32, error) {
	v, err := binary.ReadUvarint(b.r)
	return uint32(v), err
}

func (b *Base128Decoder) GetU64() (uint64, error) {
	return binary.ReadUvarint(b.r)
}
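For context, here is a minimal round-trip sketch of the group-varint API deleted above. It is not part of the commit; the import path is assumed to be the upstream github.com/Smerity/govarint, which is no longer vendored after this change.

package main

import (
	"bytes"
	"fmt"

	"github.com/Smerity/govarint" // assumed upstream path of the removed vendored package
)

func main() {
	var buf bytes.Buffer

	// Encode a partial group (3 values); Close flushes the incomplete group.
	enc := govarint.NewU32GroupVarintEncoder(&buf)
	for _, v := range []uint32{7, 300, 1 << 20} {
		if _, err := enc.PutU32(v); err != nil {
			panic(err)
		}
	}
	enc.Close()

	// Decode until io.EOF, which is how the decoder signals the partial trailing group.
	dec := govarint.NewU32GroupVarintDecoder(&buf)
	for {
		v, err := dec.GetU32()
		if err != nil {
			break // io.EOF once the three values are exhausted
		}
		fmt.Println(v)
	}
}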
@@ -0,0 +1,174 @@
// The code here was obtained from:
// https://github.com/mmcloughlin/geohash

// The MIT License (MIT)
// Copyright (c) 2015 Michael McLoughlin
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package geo

import (
	"math"
)

// encoding encapsulates an encoding defined by a given base32 alphabet.
type encoding struct {
	enc string
	dec [256]byte
}

// newEncoding constructs a new encoding defined by the given alphabet,
// which must be a 32-byte string.
func newEncoding(encoder string) *encoding {
	e := new(encoding)
	e.enc = encoder
	for i := 0; i < len(e.dec); i++ {
		e.dec[i] = 0xff
	}
	for i := 0; i < len(encoder); i++ {
		e.dec[encoder[i]] = byte(i)
	}
	return e
}

// Decode string into bits of a 64-bit word. The string s may be at most 12
// characters.
func (e *encoding) decode(s string) uint64 {
	x := uint64(0)
	for i := 0; i < len(s); i++ {
		x = (x << 5) | uint64(e.dec[s[i]])
	}
	return x
}

// Encode bits of 64-bit word into a string.
func (e *encoding) encode(x uint64) string {
	b := [12]byte{}
	for i := 0; i < 12; i++ {
		b[11-i] = e.enc[x&0x1f]
		x >>= 5
	}
	return string(b[:])
}

// Base32Encoding with the Geohash alphabet.
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")

// BoundingBox returns the region encoded by the given string geohash.
func geoBoundingBox(hash string) geoBox {
	bits := uint(5 * len(hash))
	inthash := base32encoding.decode(hash)
	return geoBoundingBoxIntWithPrecision(inthash, bits)
}

// Box represents a rectangle in latitude/longitude space.
type geoBox struct {
	minLat float64
	maxLat float64
	minLng float64
	maxLng float64
}

// Round returns a point inside the box, making an effort to round to minimal
// precision.
func (b geoBox) round() (lat, lng float64) {
	x := maxDecimalPower(b.maxLat - b.minLat)
	lat = math.Ceil(b.minLat/x) * x
	x = maxDecimalPower(b.maxLng - b.minLng)
	lng = math.Ceil(b.minLng/x) * x
	return
}

// precalculated for performance
var exp232 = math.Exp2(32)

// errorWithPrecision returns the error range in latitude and longitude for in
// integer geohash with bits of precision.
func errorWithPrecision(bits uint) (latErr, lngErr float64) {
	b := int(bits)
	latBits := b / 2
	lngBits := b - latBits
	latErr = math.Ldexp(180.0, -latBits)
	lngErr = math.Ldexp(360.0, -lngBits)
	return
}

// minDecimalPlaces returns the minimum number of decimal places such that
// there must exist an number with that many places within any range of width
// r. This is intended for returning minimal precision coordinates inside a
// box.
func maxDecimalPower(r float64) float64 {
	m := int(math.Floor(math.Log10(r)))
	return math.Pow10(m)
}

// Encode the position of x within the range -r to +r as a 32-bit integer.
func encodeRange(x, r float64) uint32 {
	p := (x + r) / (2 * r)
	return uint32(p * exp232)
}

// Decode the 32-bit range encoding X back to a value in the range -r to +r.
func decodeRange(X uint32, r float64) float64 {
	p := float64(X) / exp232
	x := 2*r*p - r
	return x
}

// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are
// ignored, and may take any value.
func squash(X uint64) uint32 {
	X &= 0x5555555555555555
	X = (X | (X >> 1)) & 0x3333333333333333
	X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f
	X = (X | (X >> 4)) & 0x00ff00ff00ff00ff
	X = (X | (X >> 8)) & 0x0000ffff0000ffff
	X = (X | (X >> 16)) & 0x00000000ffffffff
	return uint32(X)
}

// Deinterleave the bits of X into 32-bit words containing the even and odd
// bitlevels of X, respectively.
func deinterleave(X uint64) (uint32, uint32) {
	return squash(X), squash(X >> 1)
}

// BoundingBoxIntWithPrecision returns the region encoded by the integer
// geohash with the specified precision.
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox {
	fullHash := hash << (64 - bits)
	latInt, lngInt := deinterleave(fullHash)
	lat := decodeRange(latInt, 90)
	lng := decodeRange(lngInt, 180)
	latErr, lngErr := errorWithPrecision(bits)
	return geoBox{
		minLat: lat,
		maxLat: lat + latErr,
		minLng: lng,
		maxLng: lng + lngErr,
	}
}

// ----------------------------------------------------------------------

// Decode the string geohash to a (lat, lng) point.
func GeoHashDecode(hash string) (lat, lng float64) {
	box := geoBoundingBox(hash)
	return box.round()
}
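A short, illustrative sketch of how the new helper could be exercised from inside the geo package; it is not part of the commit, and the example hash is simply a commonly cited geohash (roughly 57.65N, 10.41E).

package geo

import "testing"

// TestGeoHashDecodeSketch is an illustrative sketch: it decodes an example
// geohash and checks that the rounded point falls inside the bounding box
// computed for the same hash.
func TestGeoHashDecodeSketch(t *testing.T) {
	hash := "u4pruydqqvj" // example geohash, roughly 57.65N 10.41E
	lat, lng := GeoHashDecode(hash)
	box := geoBoundingBox(hash)
	if lat < box.minLat || lat > box.maxLat || lng < box.minLng || lng > box.maxLng {
		t.Fatalf("decoded point (%v, %v) outside box %+v", lat, lng, box)
	}
}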
@@ -0,0 +1,420 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scorch

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"

	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
	"github.com/blevesearch/bleve/index/scorch/segment/zap"
)

var OptimizeConjunction = true
var OptimizeConjunctionUnadorned = true
var OptimizeDisjunctionUnadorned = true

func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if OptimizeConjunction && kind == "conjunction" {
		return s.optimizeConjunction(octx)
	}

	if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
		return s.optimizeConjunctionUnadorned(octx)
	}

	if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
		return s.optimizeDisjunctionUnadorned(octx)
	}

	return octx, nil
}

var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)

// ----------------------------------------------------------------

func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if octx == nil {
		octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
	}

	o, ok := octx.(*OptimizeTFRConjunction)
	if !ok {
		return octx, nil
	}

	if o.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
	}

	o.tfrs = append(o.tfrs, s)

	return o, nil
}

type OptimizeTFRConjunction struct {
	snapshot *IndexSnapshot

	tfrs []*IndexSnapshotTermFieldReader
}

func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	for i := range o.snapshot.segment {
		itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
		if !ok || itr0.ActualBM == nil {
			continue
		}

		itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
		if !ok || itr1.ActualBM == nil {
			continue
		}

		bm := roaring.And(itr0.ActualBM, itr1.ActualBM)

		for _, tfr := range o.tfrs[2:] {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok || itr.ActualBM == nil {
				continue
			}

			bm.And(itr.ActualBM)
		}

		// in this conjunction optimization, the postings iterators
		// will all share the same AND'ed together actual bitmap. The
		// regular conjunction searcher machinery will still be used,
		// but the underlying bitmap will be smaller.
		for _, tfr := range o.tfrs {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if ok && itr.ActualBM != nil {
				itr.ActualBM = bm
				itr.Actual = bm.Iterator()
			}
		}
	}

	return nil, nil
}

// ----------------------------------------------------------------

// An "unadorned" conjunction optimization is appropriate when
// additional or subsidiary information like freq-norm's and
// term-vectors are not required, and instead only the internal-id's
// are needed.
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if octx == nil {
		octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
	}

	o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
	if !ok {
		return nil, nil
	}

	if o.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
	}

	o.tfrs = append(o.tfrs, s)

	return o, nil
}

type OptimizeTFRConjunctionUnadorned struct {
	snapshot *IndexSnapshot

	tfrs []*IndexSnapshotTermFieldReader
}

var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
var OptimizeTFRConjunctionUnadornedField = "*"

// Finish of an unadorned conjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps AND'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	// We use an artificial term and field because the optimized
	// termFieldReader can represent multiple terms and fields.
	oTFR := &IndexSnapshotTermFieldReader{
		term:               OptimizeTFRConjunctionUnadornedTerm,
		field:              OptimizeTFRConjunctionUnadornedField,
		snapshot:           o.snapshot,
		iterators:          make([]segment.PostingsIterator, len(o.snapshot.segment)),
		segmentOffset:      0,
		includeFreq:        false,
		includeNorm:        false,
		includeTermVectors: false,
	}

	var actualBMs []*roaring.Bitmap // Collected from regular posting lists.

OUTER:
	for i := range o.snapshot.segment {
		actualBMs = actualBMs[:0]

		var docNum1HitLast uint64
		var docNum1HitLastOk bool

		for _, tfr := range o.tfrs {
			if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
				// An empty postings iterator means the entire AND is empty.
				oTFR.iterators[i] = segment.AnEmptyPostingsIterator
				continue OUTER
			}

			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok {
				// We optimize zap postings iterators only.
				return nil, nil
			}

			// If the postings iterator is "1-hit" optimized, then we
			// can perform several optimizations up-front here.
			docNum1Hit, ok := itr.DocNum1Hit()
			if ok {
				if docNum1Hit == zap.DocNum1HitFinished {
					// An empty docNum here means the entire AND is empty.
					oTFR.iterators[i] = segment.AnEmptyPostingsIterator
					continue OUTER
				}

				if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
					// The docNum1Hit doesn't match the previous
					// docNum1HitLast, so the entire AND is empty.
					oTFR.iterators[i] = segment.AnEmptyPostingsIterator
					continue OUTER
				}

				docNum1HitLast = docNum1Hit
				docNum1HitLastOk = true

				continue
			}

			if itr.ActualBM == nil {
				// An empty actual bitmap means the entire AND is empty.
				oTFR.iterators[i] = segment.AnEmptyPostingsIterator
				continue OUTER
			}

			// Collect the actual bitmap for more processing later.
			actualBMs = append(actualBMs, itr.ActualBM)
		}

		if docNum1HitLastOk {
			// We reach here if all the 1-hit optimized posting
			// iterators had the same 1-hit docNum, so we can check if
			// our collected actual bitmaps also have that docNum.
			for _, bm := range actualBMs {
				if !bm.Contains(uint32(docNum1HitLast)) {
					// The docNum1Hit isn't in one of our actual
					// bitmaps, so the entire AND is empty.
					oTFR.iterators[i] = segment.AnEmptyPostingsIterator
					continue OUTER
				}
			}

			// The actual bitmaps and docNum1Hits all contain or have
			// the same 1-hit docNum, so that's our AND'ed result.
			oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
				docNum1HitLast, zap.NormBits1Hit, false, false)
			if err != nil {
				return nil, nil
			}

			continue OUTER
		}

		if len(actualBMs) == 0 {
			// If we've collected no actual bitmaps at this point,
			// then the entire AND is empty.
			oTFR.iterators[i] = segment.AnEmptyPostingsIterator
			continue OUTER
		}

		if len(actualBMs) == 1 {
			// If we've only 1 actual bitmap, then that's our result.
			oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
				actualBMs[0], false, false)
			if err != nil {
				return nil, nil
			}

			continue OUTER
		}

		// Else, AND together our collected bitmaps as our result.
		bm := roaring.And(actualBMs[0], actualBMs[1])

		for _, actualBM := range actualBMs[2:] {
			bm.And(actualBM)
		}

		oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
			bm, false, false)
		if err != nil {
			return nil, nil
		}
	}

	return oTFR, nil
}

// ----------------------------------------------------------------

// An "unadorned" disjunction optimization is appropriate when
// additional or subsidiary information like freq-norm's and
// term-vectors are not required, and instead only the internal-id's
// are needed.
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	if octx == nil {
		octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
	}

	o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
	if !ok {
		return nil, nil
	}

	if o.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
	}

	o.tfrs = append(o.tfrs, s)

	return o, nil
}

type OptimizeTFRDisjunctionUnadorned struct {
	snapshot *IndexSnapshot

	tfrs []*IndexSnapshotTermFieldReader
}

var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
var OptimizeTFRDisjunctionUnadornedField = "*"

// Finish of an unadorned disjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps OR'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	for i := range o.snapshot.segment {
		var cMax uint64

		for _, tfr := range o.tfrs {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok {
				return nil, nil
			}

			if itr.ActualBM != nil {
				c := itr.ActualBM.GetCardinality()
				if cMax < c {
					cMax = c
				}
			}
		}

		// Heuristic to skip the optimization if all the constituent
		// bitmaps are too small, where the processing & resource
		// overhead to create the OR'ed bitmap outweighs the benefit.
		if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
			return nil, nil
		}
	}

	// We use an artificial term and field because the optimized
	// termFieldReader can represent multiple terms and fields.
	oTFR := &IndexSnapshotTermFieldReader{
		term:               OptimizeTFRDisjunctionUnadornedTerm,
		field:              OptimizeTFRDisjunctionUnadornedField,
		snapshot:           o.snapshot,
		iterators:          make([]segment.PostingsIterator, len(o.snapshot.segment)),
		segmentOffset:      0,
		includeFreq:        false,
		includeNorm:        false,
		includeTermVectors: false,
	}

	var docNums []uint32            // Collected docNum's from 1-hit posting lists.
	var actualBMs []*roaring.Bitmap // Collected from regular posting lists.

	for i := range o.snapshot.segment {
		docNums = docNums[:0]
		actualBMs = actualBMs[:0]

		for _, tfr := range o.tfrs {
			itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
			if !ok {
				return nil, nil
			}

			docNum, ok := itr.DocNum1Hit()
			if ok {
				docNums = append(docNums, uint32(docNum))
				continue
			}

			if itr.ActualBM != nil {
				actualBMs = append(actualBMs, itr.ActualBM)
			}
		}

		var bm *roaring.Bitmap
		if len(actualBMs) > 2 {
			bm = roaring.HeapOr(actualBMs...)
		} else if len(actualBMs) == 2 {
			bm = roaring.Or(actualBMs[0], actualBMs[1])
		} else if len(actualBMs) == 1 {
			bm = actualBMs[0].Clone()
		}

		if bm == nil {
			bm = roaring.New()
		}

		bm.AddMany(docNums)

		oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false)
		if err != nil {
			return nil, nil
		}
	}

	return oTFR, nil
}
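The heart of both unadorned paths above is plain roaring-bitmap algebra. Here is a standalone sketch of the same AND/OR folding, independent of the scorch types; the bitmap values are illustrative only.

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

func main() {
	// Three per-term posting bitmaps for one segment (illustrative values).
	a := roaring.BitmapOf(1, 3, 5, 9)
	b := roaring.BitmapOf(3, 5, 7, 9)
	c := roaring.BitmapOf(5, 9, 11)

	// Conjunction: fold with And, as the conjunction optimizations do.
	and := roaring.And(a, b)
	and.And(c)
	fmt.Println("AND:", and.ToArray()) // [5 9]

	// Disjunction: HeapOr for more than two operands, as
	// optimizeDisjunctionUnadorned does, then fold in any
	// "1-hit" doc numbers directly.
	or := roaring.HeapOr(a, b, c)
	or.AddMany([]uint32{42}) // a 1-hit posting added as a bare docNum
	fmt.Println("OR:", or.ToArray())
}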
@@ -1,110 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scorch

import (
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

type Reader struct {
	root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
}

func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
	includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
	return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
}

// DocIDReader returns an iterator over all doc ids
// The caller must close returned instance to release associated resources.
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
	return r.root.DocIDReaderAll()
}

func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
	return r.root.DocIDReaderOnly(ids)
}

func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
	return r.root.FieldDict(field)
}

// FieldDictRange is currently defined to include the start and end terms
func (r *Reader) FieldDictRange(field string, startTerm []byte,
	endTerm []byte) (index.FieldDict, error) {
	return r.root.FieldDictRange(field, startTerm, endTerm)
}

func (r *Reader) FieldDictPrefix(field string,
	termPrefix []byte) (index.FieldDict, error) {
	return r.root.FieldDictPrefix(field, termPrefix)
}

func (r *Reader) Document(id string) (*document.Document, error) {
	return r.root.Document(id)
}

func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
	visitor index.DocumentFieldTermVisitor) error {
	return r.root.DocumentVisitFieldTerms(id, fields, visitor)
}

func (r *Reader) Fields() ([]string, error) {
	return r.root.Fields()
}

func (r *Reader) GetInternal(key []byte) ([]byte, error) {
	return r.root.GetInternal(key)
}

func (r *Reader) DocCount() (uint64, error) {
	return r.root.DocCount()
}

func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
	return r.root.ExternalID(id)
}

func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
	return r.root.InternalID(id)
}

func (r *Reader) DumpAll() chan interface{} {
	rv := make(chan interface{})
	go func() {
		close(rv)
	}()
	return rv
}

func (r *Reader) DumpDoc(id string) chan interface{} {
	rv := make(chan interface{})
	go func() {
		close(rv)
	}()
	return rv
}

func (r *Reader) DumpFields() chan interface{} {
	rv := make(chan interface{})
	go func() {
		close(rv)
	}()
	return rv
}

func (r *Reader) Close() error {
	return r.root.DecRef()
}
@@ -1,321 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mem

import (
	"math"
	"sort"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
	s := New()

	// ensure that _id field get fieldID 0
	s.getOrDefineField("_id")

	// fill Dicts/DictKeys and preallocate memory
	s.initializeDict(results)

	// walk each doc
	for _, result := range results {
		s.processDocument(result)
	}

	// go back and sort the dictKeys
	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}

	// compute memory usage of segment
	s.updateSizeInBytes()

	// professional debugging
	//
	// log.Printf("fields: %v\n", s.FieldsMap)
	// log.Printf("fieldsInv: %v\n", s.FieldsInv)
	// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
	// log.Printf("dicts: %v\n", s.Dicts)
	// log.Printf("dict keys: %v\n", s.DictKeys)
	// for i, posting := range s.Postings {
	// 	log.Printf("posting %d: %v\n", i, posting)
	// }
	// for i, freq := range s.Freqs {
	// 	log.Printf("freq %d: %v\n", i, freq)
	// }
	// for i, norm := range s.Norms {
	// 	log.Printf("norm %d: %v\n", i, norm)
	// }
	// for i, field := range s.Locfields {
	// 	log.Printf("field %d: %v\n", i, field)
	// }
	// for i, start := range s.Locstarts {
	// 	log.Printf("start %d: %v\n", i, start)
	// }
	// for i, end := range s.Locends {
	// 	log.Printf("end %d: %v\n", i, end)
	// }
	// for i, pos := range s.Locpos {
	// 	log.Printf("pos %d: %v\n", i, pos)
	// }
	// for i, apos := range s.Locarraypos {
	// 	log.Printf("apos %d: %v\n", i, apos)
	// }
	// log.Printf("stored: %v\n", s.Stored)
	// log.Printf("stored types: %v\n", s.StoredTypes)
	// log.Printf("stored pos: %v\n", s.StoredPos)

	return s
}

// fill Dicts/DictKeys and preallocate memory for postings
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
	var numPostingsLists int

	numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
	numLocsPerPostingsList := make([]int, 0, 64)  // Keyed by postings list id.

	var numTokenFrequencies int
	var totLocs int

	// initial scan for all fieldID's to sort them
	for _, result := range results {
		for _, field := range result.Document.CompositeFields {
			s.getOrDefineField(field.Name())
		}
		for _, field := range result.Document.Fields {
			s.getOrDefineField(field.Name())
		}
	}
	sort.Strings(s.FieldsInv[1:]) // keep _id as first field
	s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}

	processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
		for term, tf := range tfs {
			pidPlus1, exists := s.Dicts[fieldID][term]
			if !exists {
				numPostingsLists++
				pidPlus1 = uint64(numPostingsLists)
				s.Dicts[fieldID][term] = pidPlus1
				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
				numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
				numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
			}
			pid := pidPlus1 - 1
			numTermsPerPostingsList[pid] += 1
			numLocsPerPostingsList[pid] += len(tf.Locations)
			totLocs += len(tf.Locations)
		}
		numTokenFrequencies += len(tfs)
	}

	for _, result := range results {
		// walk each composite field
		for _, field := range result.Document.CompositeFields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			_, tf := field.Analyze()
			processField(fieldID, tf)
		}

		// walk each field
		for i, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			tf := result.Analyzed[i]
			processField(fieldID, tf)
		}
	}

	s.Postings = make([]*roaring.Bitmap, numPostingsLists)
	for i := 0; i < numPostingsLists; i++ {
		s.Postings[i] = roaring.New()
	}
	s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
	for i := 0; i < numPostingsLists; i++ {
		s.PostingsLocs[i] = roaring.New()
	}

	// Preallocate big, contiguous backing arrays.
	auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
	uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
	float32Backing := make([]float32, numTokenFrequencies)         // For sub-Norms.
	uint16Backing := make([]uint16, totLocs)                       // For sub-Locfields.

	// Point top-level slices to the backing arrays.
	s.Freqs = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Norms = make([][]float32, numPostingsLists)

	s.Locfields = make([][]uint16, numPostingsLists)

	s.Locstarts = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Locends = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Locpos = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]

	s.Locarraypos = make([][][]uint64, numPostingsLists)

	// Point sub-slices to the backing arrays.
	for pid, numTerms := range numTermsPerPostingsList {
		s.Freqs[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numTerms:]

		s.Norms[pid] = float32Backing[0:0]
		float32Backing = float32Backing[numTerms:]
	}

	for pid, numLocs := range numLocsPerPostingsList {
		s.Locfields[pid] = uint16Backing[0:0]
		uint16Backing = uint16Backing[numLocs:]

		s.Locstarts[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]

		s.Locends[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]

		s.Locpos[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]

		s.Locarraypos[pid] = auint64Backing[0:0]
		auint64Backing = auint64Backing[numLocs:]
	}
}

func (s *Segment) processDocument(result *index.AnalysisResult) {
	// used to collate information across fields
	docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
	fieldLens := make(map[uint16]int, len(s.FieldsMap))

	docNum := uint64(s.addDocument())

	processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
		fieldLens[field] += l
		if existingFreqs, ok := docMap[field]; ok {
			existingFreqs.MergeAll(name, tf)
		} else {
			docMap[field] = tf
		}
	}

	storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
		s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
		s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
		s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
	}

	// walk each composite field
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		l, tf := field.Analyze()
		processField(fieldID, field.Name(), l, tf)
	}

	// walk each field
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		l := result.Length[i]
		tf := result.Analyzed[i]
		processField(fieldID, field.Name(), l, tf)
		if field.Options().IsStored() {
			storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
		}

		if field.Options().IncludeDocValues() {
			s.DocValueFields[fieldID] = true
		}
	}

	// now that its been rolled up into docMap, walk that
	for fieldID, tokenFrequencies := range docMap {
		for term, tokenFreq := range tokenFrequencies {
			pid := s.Dicts[fieldID][term] - 1
			bs := s.Postings[pid]
			bs.AddInt(int(docNum))
			s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
			s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
			locationBS := s.PostingsLocs[pid]
			if len(tokenFreq.Locations) > 0 {
				locationBS.AddInt(int(docNum))
				for _, loc := range tokenFreq.Locations {
					var locf = fieldID
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					s.Locfields[pid] = append(s.Locfields[pid], locf)
					s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
					s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
					s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
					if len(loc.ArrayPositions) > 0 {
						s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
					} else {
						s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
					}
				}
			}
		}
	}
}

func (s *Segment) getOrDefineField(name string) int {
	fieldIDPlus1, ok := s.FieldsMap[name]
	if !ok {
		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
		s.FieldsMap[name] = fieldIDPlus1
		s.FieldsInv = append(s.FieldsInv, name)
		s.Dicts = append(s.Dicts, make(map[string]uint64))
		s.DictKeys = append(s.DictKeys, make([]string, 0))
	}
	return int(fieldIDPlus1 - 1)
}

func (s *Segment) addDocument() int {
	docNum := len(s.Stored)
	s.Stored = append(s.Stored, map[uint16][][]byte{})
	s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
	s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
	return docNum
}

func encodeFieldType(f document.Field) byte {
	fieldType := byte('x')
	switch f.(type) {
	case *document.TextField:
		fieldType = 't'
	case *document.NumericField:
		fieldType = 'n'
	case *document.DateTimeField:
		fieldType = 'd'
	case *document.BooleanField:
		fieldType = 'b'
	case *document.GeoPointField:
		fieldType = 'g'
	case *document.CompositeField:
		fieldType = 'c'
	}
	return fieldType
}
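The per-posting norm stored by processDocument above is simply the reciprocal square root of the field length. A tiny standalone illustration of the values that end up in s.Norms (the field lengths are examples only, not from the commit):

package main

import (
	"fmt"
	"math"
)

func main() {
	// Same formula as the removed processDocument code:
	// norm = 1 / sqrt(fieldLength), stored as float32.
	for _, fieldLen := range []int{1, 4, 100} {
		norm := float32(1.0 / math.Sqrt(float64(fieldLen)))
		fmt.Printf("fieldLen=%3d  norm=%v\n", fieldLen, norm)
	}
}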
@@ -1,103 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mem

import (
	"sort"
	"strings"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
)

// Dictionary is the in-memory representation of the term dictionary
type Dictionary struct {
	segment *Segment
	field   string
	fieldID uint16
}

// PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term string,
	except *roaring.Bitmap) (segment.PostingsList, error) {
	return &PostingsList{
		dictionary: d,
		term:       term,
		postingsID: d.segment.Dicts[d.fieldID][term],
		except:     except,
	}, nil
}

// Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator {
	return &DictionaryIterator{
		d: d,
	}
}

// PrefixIterator returns an iterator which only visits terms having the
// the specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
	offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
	return &DictionaryIterator{
		d:      d,
		prefix: prefix,
		offset: offset,
	}
}

// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
	offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
	return &DictionaryIterator{
		d:      d,
		offset: offset,
		end:    end,
	}
}

// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
	d      *Dictionary
	prefix string
	end    string
	offset int

	dictEntry index.DictEntry // reused across Next()'s
}

// Next returns the next entry in the dictionary
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
	if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
		return nil, nil
	}
	next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
	// check prefix
	if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
		return nil, nil
	}
	// check end (bleve.index API demands inclusive end)
	if d.end != "" && next > d.end {
		return nil, nil
	}

	d.offset++
	postingID := d.d.segment.Dicts[d.d.fieldID][next]
	d.dictEntry.Term = next
	d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality()
	return &d.dictEntry, nil
}
@@ -1,178 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mem

import (
	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/index/scorch/segment"
)

// PostingsList is an in-memory represenation of a postings list
type PostingsList struct {
	dictionary *Dictionary
	term       string
	postingsID uint64
	except     *roaring.Bitmap
}

// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
	var rv uint64
	if p.postingsID > 0 {
		rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
		if p.except != nil {
			except := p.except.GetCardinality()
			if except > rv {
				// avoid underflow
				except = rv
			}
			rv -= except
		}
	}
	return rv
}

// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator() segment.PostingsIterator {
	rv := &PostingsIterator{
		postings: p,
	}
	if p.postingsID > 0 {
		allbits := p.dictionary.segment.Postings[p.postingsID-1]
		rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
		rv.all = allbits.Iterator()
		if p.except != nil {
			allExcept := allbits.Clone()
			allExcept.AndNot(p.except)
			rv.actual = allExcept.Iterator()
		} else {
			rv.actual = allbits.Iterator()
		}
	}

	return rv
}

// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
	postings  *PostingsList
	all       roaring.IntIterable
	locations *roaring.Bitmap
	offset    int
	locoffset int
	actual    roaring.IntIterable
}

// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
	if i.actual == nil || !i.actual.HasNext() {
		return nil, nil
	}
	n := i.actual.Next()
	allN := i.all.Next()

	// n is the next actual hit (excluding some postings)
	// allN is the next hit in the full postings
	// if they don't match, adjust offsets to factor in item we're skipping over
	// incr the all iterator, and check again
	for allN != n {
		i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
		i.offset++
		allN = i.all.Next()
	}
	rv := &Posting{
		iterator:  i,
		docNum:    uint64(n),
		offset:    i.offset,
		locoffset: i.locoffset,
		hasLoc:    i.locations.Contains(n),
	}

	i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
	i.offset++
	return rv, nil
}

// Posting is a single entry in a postings list
type Posting struct {
	iterator  *PostingsIterator
	docNum    uint64
	offset    int
	locoffset int
	hasLoc    bool
}

// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
	return p.docNum
}

// Frequency returns the frequence of occurance of this term in this doc/field
func (p *Posting) Frequency() uint64 {
	return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
}

// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
	return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
}

// Locations returns the location information for each occurance
func (p *Posting) Locations() []segment.Location {
	if !p.hasLoc {
		return nil
	}
	freq := int(p.Frequency())
	rv := make([]segment.Location, freq)
	for i := 0; i < freq; i++ {
		rv[i] = &Location{
			p:      p,
			offset: p.locoffset + i,
		}
	}
	return rv
}

// Location represents the location of a single occurance
type Location struct {
	p      *Posting
	offset int
}

// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
	return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
}

// Start returns the start byte offset of this occurance
func (l *Location) Start() uint64 {
	return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
}

// End returns the end byte offset of this occurance
func (l *Location) End() uint64 {
	return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
}

// Pos returns the 1-based phrase position of this occurance
func (l *Location) Pos() uint64 {
	return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
}

// ArrayPositions returns the array position vector associated with this occurance
func (l *Location) ArrayPositions() []uint64 {
	return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
}
@ -1,289 +0,0 @@ |
|||||||
// Copyright (c) 2017 Couchbase, Inc.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
package mem |
|
||||||
|
|
||||||
import ( |
|
||||||
"fmt" |
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring" |
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment" |
|
||||||
) |
|
||||||
|
|
||||||
// _id field is always guaranteed to have fieldID of 0
|
|
||||||
const idFieldID uint16 = 0 |
|
||||||
|
|
||||||
// KNOWN ISSUES
|
|
||||||
// - LIMITATION - we decided whether or not to store term vectors for a field
|
|
||||||
// at the segment level, based on the first definition of a
|
|
||||||
// field we see. in normal bleve usage this is fine, all
|
|
||||||
// instances of a field definition will be the same. however,
|
|
||||||
// advanced users may violate this and provide unique field
|
|
||||||
// definitions with each document. this segment does not
|
|
||||||
// support this usage.
|
|
||||||
|
|
||||||
// TODO
|
|
||||||
// - need better testing of multiple docs, iterating freqs, locations and
|
|
||||||
// and verifying the correct results are returned
|
|
||||||
|
|
||||||
// Segment is an in memory implementation of scorch.Segment
|
|
||||||
type Segment struct { |
|
||||||
|
|
||||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
|
||||||
// name -> field id + 1
|
|
||||||
FieldsMap map[string]uint16 |
|
||||||
|
|
||||||
// FieldsInv is the inverse of FieldsMap
|
|
||||||
// field id -> name
|
|
||||||
FieldsInv []string |
|
||||||
|
|
||||||
// Term dictionaries for each field
|
|
||||||
// field id -> term -> postings list id + 1
|
|
||||||
Dicts []map[string]uint64 |
|
||||||
|
|
||||||
// Terms for each field, where terms are sorted ascending
|
|
||||||
// field id -> []term
|
|
||||||
DictKeys [][]string |
|
||||||
|
|
||||||
// Postings list
|
|
||||||
// postings list id -> bitmap by docNum
|
|
||||||
Postings []*roaring.Bitmap |
|
||||||
|
|
||||||
// Postings list has locations
|
|
||||||
PostingsLocs []*roaring.Bitmap |
|
||||||
|
|
||||||
// Term frequencies
|
|
||||||
// postings list id -> Freqs (one for each hit in bitmap)
|
|
||||||
Freqs [][]uint64 |
|
||||||
|
|
||||||
// Field norms
|
|
||||||
// postings list id -> Norms (one for each hit in bitmap)
|
|
||||||
Norms [][]float32 |
|
||||||
|
|
||||||
// Field/start/end/pos/locarraypos
|
|
||||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
|
||||||
Locfields [][]uint16 |
|
||||||
Locstarts [][]uint64 |
|
||||||
Locends [][]uint64 |
|
||||||
Locpos [][]uint64 |
|
||||||
Locarraypos [][][]uint64 |
|
||||||
|
|
||||||
// Stored field values
|
|
||||||
// docNum -> field id -> slice of values (each value []byte)
|
|
||||||
Stored []map[uint16][][]byte |
|
||||||
|
|
||||||
// Stored field types
|
|
||||||
// docNum -> field id -> slice of types (each type byte)
|
|
||||||
StoredTypes []map[uint16][]byte |
|
||||||
|
|
||||||
// Stored field array positions
|
|
||||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
|
||||||
StoredPos []map[uint16][][]uint64 |
|
||||||
|
|
||||||
// For storing the docValue persisted fields
|
|
||||||
DocValueFields map[uint16]bool |
|
||||||
|
|
||||||
// Footprint of the segment, updated when analyzed document mutations
|
|
||||||
// are added into the segment
|
|
||||||
sizeInBytes uint64 |
|
||||||
} |
|
||||||
|
|
||||||
// New builds a new empty Segment
|
|
||||||
func New() *Segment { |
|
||||||
return &Segment{ |
|
||||||
FieldsMap: map[string]uint16{}, |
|
||||||
DocValueFields: map[uint16]bool{}, |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
func (s *Segment) updateSizeInBytes() { |
|
||||||
var sizeInBytes uint64 |
|
||||||
|
|
||||||
// FieldsMap, FieldsInv
|
|
||||||
for k, _ := range s.FieldsMap { |
|
||||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + |
|
||||||
2 /* size of uint16 */) |
|
||||||
} |
|
||||||
// overhead from the data structures
|
|
||||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) |
|
||||||
|
|
||||||
// Dicts, DictKeys
|
|
||||||
for _, entry := range s.Dicts { |
|
||||||
for k, _ := range entry { |
|
||||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + |
|
||||||
8 /* size of uint64 */) |
|
||||||
} |
|
||||||
// overhead from the data structures
|
|
||||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) |
|
||||||
} |
|
||||||
sizeInBytes += (segment.SizeOfSlice * 2) |
|
||||||
|
|
||||||
// Postings, PostingsLocs
|
|
||||||
for i := 0; i < len(s.Postings); i++ { |
|
||||||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + |
|
||||||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) |
|
||||||
} |
|
||||||
sizeInBytes += (segment.SizeOfSlice * 2) |
|
||||||
|
|
||||||
// Freqs, Norms
|
|
||||||
for i := 0; i < len(s.Freqs); i++ { |
|
||||||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + |
|
||||||
len(s.Norms[i])*4 /* size of float32 */) + |
|
||||||
(segment.SizeOfSlice * 2) |
|
||||||
} |
|
||||||
sizeInBytes += (segment.SizeOfSlice * 2) |
|
||||||
|
|
||||||
// Location data
|
|
||||||
for i := 0; i < len(s.Locfields); i++ { |
|
||||||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + |
|
||||||
len(s.Locstarts[i])*8 /* size of uint64 */ + |
|
||||||
len(s.Locends[i])*8 /* size of uint64 */ + |
|
||||||
len(s.Locpos[i])*8 /* size of uint64 */) |
|
||||||
|
|
||||||
for j := 0; j < len(s.Locarraypos[i]); j++ { |
|
||||||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + |
|
||||||
segment.SizeOfSlice |
|
||||||
} |
|
||||||
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 5) |
|
||||||
} |
|
||||||
sizeInBytes += (segment.SizeOfSlice * 5) |
|
||||||
|
|
||||||
// Stored data
|
|
||||||
for i := 0; i < len(s.Stored); i++ { |
|
||||||
for _, v := range s.Stored[i] { |
|
||||||
sizeInBytes += uint64(2 /* size of uint16 */) |
|
||||||
for _, arr := range v { |
|
||||||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice |
|
||||||
} |
|
||||||
sizeInBytes += segment.SizeOfSlice |
|
||||||
} |
|
||||||
|
|
||||||
for _, v := range s.StoredTypes[i] { |
|
||||||
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice |
|
||||||
} |
|
||||||
|
|
||||||
for _, v := range s.StoredPos[i] { |
|
||||||
sizeInBytes += uint64(2 /* size of uint16 */) |
|
||||||
for _, arr := range v { |
|
||||||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + |
|
||||||
segment.SizeOfSlice |
|
||||||
} |
|
||||||
sizeInBytes += segment.SizeOfSlice |
|
||||||
} |
|
||||||
|
|
||||||
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
|
||||||
sizeInBytes += (segment.SizeOfMap * 3) |
|
||||||
} |
|
||||||
// overhead from data structures: Stored, StoredTypes, StoredPos
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 3) |
|
||||||
|
|
||||||
// DocValueFields
|
|
||||||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + |
|
||||||
segment.SizeOfMap |
|
||||||
|
|
||||||
// SizeInBytes
|
|
||||||
sizeInBytes += uint64(8) |
|
||||||
|
|
||||||
s.sizeInBytes = sizeInBytes |
|
||||||
} |
|
||||||
|
|
||||||
func (s *Segment) SizeInBytes() uint64 { |
|
||||||
return s.sizeInBytes |
|
||||||
} |
|
||||||
|
|
||||||
func (s *Segment) AddRef() { |
|
||||||
} |
|
||||||
|
|
||||||
func (s *Segment) DecRef() error { |
|
||||||
return nil |
|
||||||
} |
|
||||||
|
|
||||||
// Fields returns the field names used in this segment
|
|
||||||
func (s *Segment) Fields() []string { |
|
||||||
return s.FieldsInv |
|
||||||
} |
|
||||||
|
|
||||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
|
||||||
// for the specified doc number
|
|
||||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { |
|
||||||
// ensure document number exists
|
|
||||||
if int(num) > len(s.Stored)-1 { |
|
||||||
return nil |
|
||||||
} |
|
||||||
docFields := s.Stored[int(num)] |
|
||||||
st := s.StoredTypes[int(num)] |
|
||||||
sp := s.StoredPos[int(num)] |
|
||||||
for field, values := range docFields { |
|
||||||
for i, value := range values { |
|
||||||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) |
|
||||||
if !keepGoing { |
|
||||||
return nil |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
return nil |
|
||||||
} |
|
||||||
|
|
||||||
func (s *Segment) getField(name string) (int, error) { |
|
||||||
fieldID, ok := s.FieldsMap[name] |
|
||||||
if !ok { |
|
||||||
return 0, fmt.Errorf("no field named %s", name) |
|
||||||
} |
|
||||||
return int(fieldID - 1), nil |
|
||||||
} |
|
||||||
|
|
||||||
// Dictionary returns the term dictionary for the specified field
|
|
||||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { |
|
||||||
fieldID, err := s.getField(field) |
|
||||||
if err != nil { |
|
||||||
// no such field, return empty dictionary
|
|
||||||
return &segment.EmptyDictionary{}, nil |
|
||||||
} |
|
||||||
return &Dictionary{ |
|
||||||
segment: s, |
|
||||||
field: field, |
|
||||||
fieldID: uint16(fieldID), |
|
||||||
}, nil |
|
||||||
} |
|
||||||
|
|
||||||
// Count returns the number of documents in this segment
|
|
||||||
// (this has no notion of deleted docs)
|
|
||||||
func (s *Segment) Count() uint64 { |
|
||||||
return uint64(len(s.Stored)) |
|
||||||
} |
|
||||||
|
|
||||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
|
||||||
// provided _id strings
|
|
||||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { |
|
||||||
rv := roaring.New() |
|
||||||
|
|
||||||
// guard against empty segment
|
|
||||||
if len(s.FieldsMap) > 0 { |
|
||||||
idDictionary := s.Dicts[idFieldID] |
|
||||||
|
|
||||||
for _, id := range ids { |
|
||||||
postingID := idDictionary[id] |
|
||||||
if postingID > 0 { |
|
||||||
rv.Or(s.Postings[postingID-1]) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
return rv, nil |
|
||||||
} |
|
||||||
|
|
||||||
// Close releases all resources associated with this segment
|
|
||||||
func (s *Segment) Close() error { |
|
||||||
return nil |
|
||||||
} |
|
@ -0,0 +1,75 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package segment

import (
	"regexp/syntax"

	"github.com/couchbase/vellum/regexp"
)

func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
	// TODO: potential optimization where syntax.Regexp supports a Simplify() API?

	parsed, err := syntax.Parse(pattern, syntax.Perl)
	if err != nil {
		return nil, nil, nil, err
	}

	re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit)
	if err != nil {
		return nil, nil, nil, err
	}

	prefix := LiteralPrefix(parsed)
	if prefix != "" {
		prefixBeg := []byte(prefix)
		prefixEnd := IncrementBytes(prefixBeg)
		return re, prefixBeg, prefixEnd, nil
	}

	return re, nil, nil, nil
}

// Returns the literal prefix given the parse tree for a regexp
func LiteralPrefix(s *syntax.Regexp) string {
	// traverse the left-most branch in the parse tree as long as the
	// node represents a concatenation
	for s != nil && s.Op == syntax.OpConcat {
		if len(s.Sub) < 1 {
			return ""
		}

		s = s.Sub[0]
	}

	if s.Op == syntax.OpLiteral {
		return string(s.Rune)
	}

	return "" // no literal prefix
}

func IncrementBytes(in []byte) []byte {
	rv := make([]byte, len(in))
	copy(rv, in)
	for i := len(rv) - 1; i >= 0; i-- {
		rv[i] = rv[i] + 1
		if rv[i] != 0 {
			return rv // didn't overflow, so stop
		}
	}
	return nil // overflowed
}
@ -0,0 +1,826 @@ |
|||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package zap |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"encoding/binary" |
||||||
|
"math" |
||||||
|
"sort" |
||||||
|
"sync" |
||||||
|
|
||||||
|
"github.com/RoaringBitmap/roaring" |
||||||
|
"github.com/blevesearch/bleve/analysis" |
||||||
|
"github.com/blevesearch/bleve/document" |
||||||
|
"github.com/blevesearch/bleve/index" |
||||||
|
"github.com/couchbase/vellum" |
||||||
|
"github.com/golang/snappy" |
||||||
|
) |
||||||
|
|
||||||
|
var NewSegmentBufferNumResultsBump int = 100 |
||||||
|
var NewSegmentBufferNumResultsFactor float64 = 1.0 |
||||||
|
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 |
||||||
|
|
||||||
|
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
|
||||||
|
// SegmentBase from analysis results
|
||||||
|
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, |
||||||
|
chunkFactor uint32) (*SegmentBase, uint64, error) { |
||||||
|
s := interimPool.Get().(*interim) |
||||||
|
|
||||||
|
var br bytes.Buffer |
||||||
|
if s.lastNumDocs > 0 { |
||||||
|
// use previous results to initialize the buf with an estimate
|
||||||
|
// size, but note that the interim instance comes from a
|
||||||
|
// global interimPool, so multiple scorch instances indexing
|
||||||
|
// different docs can lead to low quality estimates
|
||||||
|
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * |
||||||
|
NewSegmentBufferNumResultsFactor) |
||||||
|
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * |
||||||
|
NewSegmentBufferAvgBytesPerDocFactor) |
||||||
|
br.Grow(estimateAvgBytesPerDoc * estimateNumResults) |
||||||
|
} |
||||||
|
|
||||||
|
s.results = results |
||||||
|
s.chunkFactor = chunkFactor |
||||||
|
s.w = NewCountHashWriter(&br) |
||||||
|
|
||||||
|
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, |
||||||
|
err := s.convert() |
||||||
|
if err != nil { |
||||||
|
return nil, uint64(0), err |
||||||
|
} |
||||||
|
|
||||||
|
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, |
||||||
|
s.FieldsMap, s.FieldsInv, uint64(len(results)), |
||||||
|
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) |
||||||
|
|
||||||
|
if err == nil && s.reset() == nil { |
||||||
|
s.lastNumDocs = len(results) |
||||||
|
s.lastOutSize = len(br.Bytes()) |
||||||
|
interimPool.Put(s) |
||||||
|
} |
||||||
|
|
||||||
|
return sb, uint64(len(br.Bytes())), err |
||||||
|
} |
||||||
|
|
||||||
|
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} |
||||||
|
|
||||||
|
// interim holds temporary working data used while converting from
|
||||||
|
// analysis results to a zap-encoded segment
|
||||||
|
type interim struct { |
||||||
|
results []*index.AnalysisResult |
||||||
|
|
||||||
|
chunkFactor uint32 |
||||||
|
|
||||||
|
w *CountHashWriter |
||||||
|
|
||||||
|
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||||
|
// name -> field id + 1
|
||||||
|
FieldsMap map[string]uint16 |
||||||
|
|
||||||
|
// FieldsInv is the inverse of FieldsMap
|
||||||
|
// field id -> name
|
||||||
|
FieldsInv []string |
||||||
|
|
||||||
|
// Term dictionaries for each field
|
||||||
|
// field id -> term -> postings list id + 1
|
||||||
|
Dicts []map[string]uint64 |
||||||
|
|
||||||
|
// Terms for each field, where terms are sorted ascending
|
||||||
|
// field id -> []term
|
||||||
|
DictKeys [][]string |
||||||
|
|
||||||
|
// Fields whose IncludeDocValues is true
|
||||||
|
// field id -> bool
|
||||||
|
IncludeDocValues []bool |
||||||
|
|
||||||
|
// postings id -> bitmap of docNums
|
||||||
|
Postings []*roaring.Bitmap |
||||||
|
|
||||||
|
// postings id -> freq/norm's, one for each docNum in postings
|
||||||
|
FreqNorms [][]interimFreqNorm |
||||||
|
freqNormsBacking []interimFreqNorm |
||||||
|
|
||||||
|
// postings id -> locs, one for each freq
|
||||||
|
Locs [][]interimLoc |
||||||
|
locsBacking []interimLoc |
||||||
|
|
||||||
|
numTermsPerPostingsList []int // key is postings list id
|
||||||
|
numLocsPerPostingsList []int // key is postings list id
|
||||||
|
|
||||||
|
builder *vellum.Builder |
||||||
|
builderBuf bytes.Buffer |
||||||
|
|
||||||
|
metaBuf bytes.Buffer |
||||||
|
|
||||||
|
tmp0 []byte |
||||||
|
tmp1 []byte |
||||||
|
|
||||||
|
lastNumDocs int |
||||||
|
lastOutSize int |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) reset() (err error) { |
||||||
|
s.results = nil |
||||||
|
s.chunkFactor = 0 |
||||||
|
s.w = nil |
||||||
|
s.FieldsMap = nil |
||||||
|
s.FieldsInv = nil |
||||||
|
for i := range s.Dicts { |
||||||
|
s.Dicts[i] = nil |
||||||
|
} |
||||||
|
s.Dicts = s.Dicts[:0] |
||||||
|
for i := range s.DictKeys { |
||||||
|
s.DictKeys[i] = s.DictKeys[i][:0] |
||||||
|
} |
||||||
|
s.DictKeys = s.DictKeys[:0] |
||||||
|
for i := range s.IncludeDocValues { |
||||||
|
s.IncludeDocValues[i] = false |
||||||
|
} |
||||||
|
s.IncludeDocValues = s.IncludeDocValues[:0] |
||||||
|
for _, idn := range s.Postings { |
||||||
|
idn.Clear() |
||||||
|
} |
||||||
|
s.Postings = s.Postings[:0] |
||||||
|
s.FreqNorms = s.FreqNorms[:0] |
||||||
|
for i := range s.freqNormsBacking { |
||||||
|
s.freqNormsBacking[i] = interimFreqNorm{} |
||||||
|
} |
||||||
|
s.freqNormsBacking = s.freqNormsBacking[:0] |
||||||
|
s.Locs = s.Locs[:0] |
||||||
|
for i := range s.locsBacking { |
||||||
|
s.locsBacking[i] = interimLoc{} |
||||||
|
} |
||||||
|
s.locsBacking = s.locsBacking[:0] |
||||||
|
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] |
||||||
|
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] |
||||||
|
s.builderBuf.Reset() |
||||||
|
if s.builder != nil { |
||||||
|
err = s.builder.Reset(&s.builderBuf) |
||||||
|
} |
||||||
|
s.metaBuf.Reset() |
||||||
|
s.tmp0 = s.tmp0[:0] |
||||||
|
s.tmp1 = s.tmp1[:0] |
||||||
|
s.lastNumDocs = 0 |
||||||
|
s.lastOutSize = 0 |
||||||
|
|
||||||
|
return err |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) grabBuf(size int) []byte { |
||||||
|
buf := s.tmp0 |
||||||
|
if cap(buf) < size { |
||||||
|
buf = make([]byte, size) |
||||||
|
s.tmp0 = buf |
||||||
|
} |
||||||
|
return buf[0:size] |
||||||
|
} |
||||||
|
|
||||||
|
type interimStoredField struct { |
||||||
|
vals [][]byte |
||||||
|
typs []byte |
||||||
|
arrayposs [][]uint64 // array positions
|
||||||
|
} |
||||||
|
|
||||||
|
type interimFreqNorm struct { |
||||||
|
freq uint64 |
||||||
|
norm float32 |
||||||
|
numLocs int |
||||||
|
} |
||||||
|
|
||||||
|
type interimLoc struct { |
||||||
|
fieldID uint16 |
||||||
|
pos uint64 |
||||||
|
start uint64 |
||||||
|
end uint64 |
||||||
|
arrayposs []uint64 |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { |
||||||
|
s.FieldsMap = map[string]uint16{} |
||||||
|
|
||||||
|
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||||
|
|
||||||
|
for _, result := range s.results { |
||||||
|
for _, field := range result.Document.CompositeFields { |
||||||
|
s.getOrDefineField(field.Name()) |
||||||
|
} |
||||||
|
for _, field := range result.Document.Fields { |
||||||
|
s.getOrDefineField(field.Name()) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||||
|
|
||||||
|
for fieldID, fieldName := range s.FieldsInv { |
||||||
|
s.FieldsMap[fieldName] = uint16(fieldID + 1) |
||||||
|
} |
||||||
|
|
||||||
|
if cap(s.IncludeDocValues) >= len(s.FieldsInv) { |
||||||
|
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] |
||||||
|
} else { |
||||||
|
s.IncludeDocValues = make([]bool, len(s.FieldsInv)) |
||||||
|
} |
||||||
|
|
||||||
|
s.prepareDicts() |
||||||
|
|
||||||
|
for _, dict := range s.DictKeys { |
||||||
|
sort.Strings(dict) |
||||||
|
} |
||||||
|
|
||||||
|
s.processDocuments() |
||||||
|
|
||||||
|
storedIndexOffset, err := s.writeStoredFields() |
||||||
|
if err != nil { |
||||||
|
return 0, 0, 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
var fdvIndexOffset uint64 |
||||||
|
var dictOffsets []uint64 |
||||||
|
|
||||||
|
if len(s.results) > 0 { |
||||||
|
fdvIndexOffset, dictOffsets, err = s.writeDicts() |
||||||
|
if err != nil { |
||||||
|
return 0, 0, 0, nil, err |
||||||
|
} |
||||||
|
} else { |
||||||
|
dictOffsets = make([]uint64, len(s.FieldsInv)) |
||||||
|
} |
||||||
|
|
||||||
|
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) |
||||||
|
if err != nil { |
||||||
|
return 0, 0, 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) getOrDefineField(fieldName string) int { |
||||||
|
fieldIDPlus1, exists := s.FieldsMap[fieldName] |
||||||
|
if !exists { |
||||||
|
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) |
||||||
|
s.FieldsMap[fieldName] = fieldIDPlus1 |
||||||
|
s.FieldsInv = append(s.FieldsInv, fieldName) |
||||||
|
|
||||||
|
s.Dicts = append(s.Dicts, make(map[string]uint64)) |
||||||
|
|
||||||
|
n := len(s.DictKeys) |
||||||
|
if n < cap(s.DictKeys) { |
||||||
|
s.DictKeys = s.DictKeys[:n+1] |
||||||
|
s.DictKeys[n] = s.DictKeys[n][:0] |
||||||
|
} else { |
||||||
|
s.DictKeys = append(s.DictKeys, []string(nil)) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return int(fieldIDPlus1 - 1) |
||||||
|
} |
||||||
|
|
||||||
|
// fill Dicts and DictKeys from analysis results
|
||||||
|
func (s *interim) prepareDicts() { |
||||||
|
var pidNext int |
||||||
|
|
||||||
|
var totTFs int |
||||||
|
var totLocs int |
||||||
|
|
||||||
|
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { |
||||||
|
dict := s.Dicts[fieldID] |
||||||
|
dictKeys := s.DictKeys[fieldID] |
||||||
|
|
||||||
|
for term, tf := range tfs { |
||||||
|
pidPlus1, exists := dict[term] |
||||||
|
if !exists { |
||||||
|
pidNext++ |
||||||
|
pidPlus1 = uint64(pidNext) |
||||||
|
|
||||||
|
dict[term] = pidPlus1 |
||||||
|
dictKeys = append(dictKeys, term) |
||||||
|
|
||||||
|
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) |
||||||
|
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) |
||||||
|
} |
||||||
|
|
||||||
|
pid := pidPlus1 - 1 |
||||||
|
|
||||||
|
s.numTermsPerPostingsList[pid] += 1 |
||||||
|
s.numLocsPerPostingsList[pid] += len(tf.Locations) |
||||||
|
|
||||||
|
totLocs += len(tf.Locations) |
||||||
|
} |
||||||
|
|
||||||
|
totTFs += len(tfs) |
||||||
|
|
||||||
|
s.DictKeys[fieldID] = dictKeys |
||||||
|
} |
||||||
|
|
||||||
|
for _, result := range s.results { |
||||||
|
// walk each composite field
|
||||||
|
for _, field := range result.Document.CompositeFields { |
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||||
|
_, tf := field.Analyze() |
||||||
|
visitField(fieldID, tf) |
||||||
|
} |
||||||
|
|
||||||
|
// walk each field
|
||||||
|
for i, field := range result.Document.Fields { |
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||||
|
tf := result.Analyzed[i] |
||||||
|
visitField(fieldID, tf) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
numPostingsLists := pidNext |
||||||
|
|
||||||
|
if cap(s.Postings) >= numPostingsLists { |
||||||
|
s.Postings = s.Postings[:numPostingsLists] |
||||||
|
} else { |
||||||
|
postings := make([]*roaring.Bitmap, numPostingsLists) |
||||||
|
copy(postings, s.Postings[:cap(s.Postings)]) |
||||||
|
for i := 0; i < numPostingsLists; i++ { |
||||||
|
if postings[i] == nil { |
||||||
|
postings[i] = roaring.New() |
||||||
|
} |
||||||
|
} |
||||||
|
s.Postings = postings |
||||||
|
} |
||||||
|
|
||||||
|
if cap(s.FreqNorms) >= numPostingsLists { |
||||||
|
s.FreqNorms = s.FreqNorms[:numPostingsLists] |
||||||
|
} else { |
||||||
|
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) |
||||||
|
} |
||||||
|
|
||||||
|
if cap(s.freqNormsBacking) >= totTFs { |
||||||
|
s.freqNormsBacking = s.freqNormsBacking[:totTFs] |
||||||
|
} else { |
||||||
|
s.freqNormsBacking = make([]interimFreqNorm, totTFs) |
||||||
|
} |
||||||
|
|
||||||
|
freqNormsBacking := s.freqNormsBacking |
||||||
|
for pid, numTerms := range s.numTermsPerPostingsList { |
||||||
|
s.FreqNorms[pid] = freqNormsBacking[0:0] |
||||||
|
freqNormsBacking = freqNormsBacking[numTerms:] |
||||||
|
} |
||||||
|
|
||||||
|
if cap(s.Locs) >= numPostingsLists { |
||||||
|
s.Locs = s.Locs[:numPostingsLists] |
||||||
|
} else { |
||||||
|
s.Locs = make([][]interimLoc, numPostingsLists) |
||||||
|
} |
||||||
|
|
||||||
|
if cap(s.locsBacking) >= totLocs { |
||||||
|
s.locsBacking = s.locsBacking[:totLocs] |
||||||
|
} else { |
||||||
|
s.locsBacking = make([]interimLoc, totLocs) |
||||||
|
} |
||||||
|
|
||||||
|
locsBacking := s.locsBacking |
||||||
|
for pid, numLocs := range s.numLocsPerPostingsList { |
||||||
|
s.Locs[pid] = locsBacking[0:0] |
||||||
|
locsBacking = locsBacking[numLocs:] |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) processDocuments() { |
||||||
|
numFields := len(s.FieldsInv) |
||||||
|
reuseFieldLens := make([]int, numFields) |
||||||
|
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) |
||||||
|
|
||||||
|
for docNum, result := range s.results { |
||||||
|
for i := 0; i < numFields; i++ { // clear these for reuse
|
||||||
|
reuseFieldLens[i] = 0 |
||||||
|
reuseFieldTFs[i] = nil |
||||||
|
} |
||||||
|
|
||||||
|
s.processDocument(uint64(docNum), result, |
||||||
|
reuseFieldLens, reuseFieldTFs) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) processDocument(docNum uint64, |
||||||
|
result *index.AnalysisResult, |
||||||
|
fieldLens []int, fieldTFs []analysis.TokenFrequencies) { |
||||||
|
visitField := func(fieldID uint16, fieldName string, |
||||||
|
ln int, tf analysis.TokenFrequencies) { |
||||||
|
fieldLens[fieldID] += ln |
||||||
|
|
||||||
|
existingFreqs := fieldTFs[fieldID] |
||||||
|
if existingFreqs != nil { |
||||||
|
existingFreqs.MergeAll(fieldName, tf) |
||||||
|
} else { |
||||||
|
fieldTFs[fieldID] = tf |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// walk each composite field
|
||||||
|
for _, field := range result.Document.CompositeFields { |
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||||
|
ln, tf := field.Analyze() |
||||||
|
visitField(fieldID, field.Name(), ln, tf) |
||||||
|
} |
||||||
|
|
||||||
|
// walk each field
|
||||||
|
for i, field := range result.Document.Fields { |
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||||
|
ln := result.Length[i] |
||||||
|
tf := result.Analyzed[i] |
||||||
|
visitField(fieldID, field.Name(), ln, tf) |
||||||
|
} |
||||||
|
|
||||||
|
// now that it's been rolled up into fieldTFs, walk that
|
||||||
|
for fieldID, tfs := range fieldTFs { |
||||||
|
dict := s.Dicts[fieldID] |
||||||
|
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) |
||||||
|
|
||||||
|
for term, tf := range tfs { |
||||||
|
pid := dict[term] - 1 |
||||||
|
bs := s.Postings[pid] |
||||||
|
bs.Add(uint32(docNum)) |
||||||
|
|
||||||
|
s.FreqNorms[pid] = append(s.FreqNorms[pid], |
||||||
|
interimFreqNorm{ |
||||||
|
freq: uint64(tf.Frequency()), |
||||||
|
norm: norm, |
||||||
|
numLocs: len(tf.Locations), |
||||||
|
}) |
||||||
|
|
||||||
|
if len(tf.Locations) > 0 { |
||||||
|
locs := s.Locs[pid] |
||||||
|
|
||||||
|
for _, loc := range tf.Locations { |
||||||
|
var locf = uint16(fieldID) |
||||||
|
if loc.Field != "" { |
||||||
|
locf = uint16(s.getOrDefineField(loc.Field)) |
||||||
|
} |
||||||
|
var arrayposs []uint64 |
||||||
|
if len(loc.ArrayPositions) > 0 { |
||||||
|
arrayposs = loc.ArrayPositions |
||||||
|
} |
||||||
|
locs = append(locs, interimLoc{ |
||||||
|
fieldID: locf, |
||||||
|
pos: uint64(loc.Position), |
||||||
|
start: uint64(loc.Start), |
||||||
|
end: uint64(loc.End), |
||||||
|
arrayposs: arrayposs, |
||||||
|
}) |
||||||
|
} |
||||||
|
|
||||||
|
s.Locs[pid] = locs |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) writeStoredFields() ( |
||||||
|
storedIndexOffset uint64, err error) { |
||||||
|
varBuf := make([]byte, binary.MaxVarintLen64) |
||||||
|
metaEncode := func(val uint64) (int, error) { |
||||||
|
wb := binary.PutUvarint(varBuf, val) |
||||||
|
return s.metaBuf.Write(varBuf[:wb]) |
||||||
|
} |
||||||
|
|
||||||
|
data, compressed := s.tmp0[:0], s.tmp1[:0] |
||||||
|
defer func() { s.tmp0, s.tmp1 = data, compressed }() |
||||||
|
|
||||||
|
// keyed by docNum
|
||||||
|
docStoredOffsets := make([]uint64, len(s.results)) |
||||||
|
|
||||||
|
// keyed by fieldID, for the current doc in the loop
|
||||||
|
docStoredFields := map[uint16]interimStoredField{} |
||||||
|
|
||||||
|
for docNum, result := range s.results { |
||||||
|
for fieldID := range docStoredFields { // reset for next doc
|
||||||
|
delete(docStoredFields, fieldID) |
||||||
|
} |
||||||
|
|
||||||
|
for _, field := range result.Document.Fields { |
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name())) |
||||||
|
|
||||||
|
opts := field.Options() |
||||||
|
|
||||||
|
if opts.IsStored() { |
||||||
|
isf := docStoredFields[fieldID] |
||||||
|
isf.vals = append(isf.vals, field.Value()) |
||||||
|
isf.typs = append(isf.typs, encodeFieldType(field)) |
||||||
|
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) |
||||||
|
docStoredFields[fieldID] = isf |
||||||
|
} |
||||||
|
|
||||||
|
if opts.IncludeDocValues() { |
||||||
|
s.IncludeDocValues[fieldID] = true |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
var curr int |
||||||
|
|
||||||
|
s.metaBuf.Reset() |
||||||
|
data = data[:0] |
||||||
|
|
||||||
|
// _id field special case optimizes ExternalID() lookups
|
||||||
|
idFieldVal := docStoredFields[uint16(0)].vals[0] |
||||||
|
_, err = metaEncode(uint64(len(idFieldVal))) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
|
||||||
|
// handle non-"_id" fields
|
||||||
|
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { |
||||||
|
isf, exists := docStoredFields[uint16(fieldID)] |
||||||
|
if exists { |
||||||
|
curr, data, err = persistStoredFieldValues( |
||||||
|
fieldID, isf.vals, isf.typs, isf.arrayposs, |
||||||
|
curr, metaEncode, data) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
metaBytes := s.metaBuf.Bytes() |
||||||
|
|
||||||
|
compressed = snappy.Encode(compressed[:cap(compressed)], data) |
||||||
|
|
||||||
|
docStoredOffsets[docNum] = uint64(s.w.Count()) |
||||||
|
|
||||||
|
_, err := writeUvarints(s.w, |
||||||
|
uint64(len(metaBytes)), |
||||||
|
uint64(len(idFieldVal)+len(compressed))) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
|
||||||
|
_, err = s.w.Write(metaBytes) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
|
||||||
|
_, err = s.w.Write(idFieldVal) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
|
||||||
|
_, err = s.w.Write(compressed) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
storedIndexOffset = uint64(s.w.Count()) |
||||||
|
|
||||||
|
for _, docStoredOffset := range docStoredOffsets { |
||||||
|
err = binary.Write(s.w, binary.BigEndian, docStoredOffset) |
||||||
|
if err != nil { |
||||||
|
return 0, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return storedIndexOffset, nil |
||||||
|
} |
||||||
|
|
||||||
|
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { |
||||||
|
dictOffsets = make([]uint64, len(s.FieldsInv)) |
||||||
|
|
||||||
|
fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) |
||||||
|
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) |
||||||
|
|
||||||
|
buf := s.grabBuf(binary.MaxVarintLen64) |
||||||
|
|
||||||
|
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) |
||||||
|
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) |
||||||
|
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false) |
||||||
|
|
||||||
|
var docTermMap [][]byte |
||||||
|
|
||||||
|
if s.builder == nil { |
||||||
|
s.builder, err = vellum.New(&s.builderBuf, nil) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
for fieldID, terms := range s.DictKeys { |
||||||
|
if cap(docTermMap) < len(s.results) { |
||||||
|
docTermMap = make([][]byte, len(s.results)) |
||||||
|
} else { |
||||||
|
docTermMap = docTermMap[0:len(s.results)] |
||||||
|
for docNum := range docTermMap { // reset the docTermMap
|
||||||
|
docTermMap[docNum] = docTermMap[docNum][:0] |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
dict := s.Dicts[fieldID] |
||||||
|
|
||||||
|
for _, term := range terms { // terms are already sorted
|
||||||
|
pid := dict[term] - 1 |
||||||
|
|
||||||
|
postingsBS := s.Postings[pid] |
||||||
|
|
||||||
|
freqNorms := s.FreqNorms[pid] |
||||||
|
freqNormOffset := 0 |
||||||
|
|
||||||
|
locs := s.Locs[pid] |
||||||
|
locOffset := 0 |
||||||
|
|
||||||
|
postingsItr := postingsBS.Iterator() |
||||||
|
for postingsItr.HasNext() { |
||||||
|
docNum := uint64(postingsItr.Next()) |
||||||
|
|
||||||
|
freqNorm := freqNorms[freqNormOffset] |
||||||
|
|
||||||
|
err = tfEncoder.Add(docNum, |
||||||
|
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), |
||||||
|
uint64(math.Float32bits(freqNorm.norm))) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
if freqNorm.numLocs > 0 { |
||||||
|
numBytesLocs := 0 |
||||||
|
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { |
||||||
|
numBytesLocs += totalUvarintBytes( |
||||||
|
uint64(loc.fieldID), loc.pos, loc.start, loc.end, |
||||||
|
uint64(len(loc.arrayposs)), loc.arrayposs) |
||||||
|
} |
||||||
|
|
||||||
|
err = locEncoder.Add(docNum, uint64(numBytesLocs)) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { |
||||||
|
err = locEncoder.Add(docNum, |
||||||
|
uint64(loc.fieldID), loc.pos, loc.start, loc.end, |
||||||
|
uint64(len(loc.arrayposs))) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
err = locEncoder.Add(docNum, loc.arrayposs...) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
locOffset += freqNorm.numLocs |
||||||
|
} |
||||||
|
|
||||||
|
freqNormOffset++ |
||||||
|
|
||||||
|
docTermMap[docNum] = append( |
||||||
|
append(docTermMap[docNum], term...), |
||||||
|
termSeparator) |
||||||
|
} |
||||||
|
|
||||||
|
tfEncoder.Close() |
||||||
|
locEncoder.Close() |
||||||
|
|
||||||
|
postingsOffset, err := |
||||||
|
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
if postingsOffset > uint64(0) { |
||||||
|
err = s.builder.Insert([]byte(term), postingsOffset) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
tfEncoder.Reset() |
||||||
|
locEncoder.Reset() |
||||||
|
} |
||||||
|
|
||||||
|
err = s.builder.Close() |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
// record where this dictionary starts
|
||||||
|
dictOffsets[fieldID] = uint64(s.w.Count()) |
||||||
|
|
||||||
|
vellumData := s.builderBuf.Bytes() |
||||||
|
|
||||||
|
// write out the length of the vellum data
|
||||||
|
n := binary.PutUvarint(buf, uint64(len(vellumData))) |
||||||
|
_, err = s.w.Write(buf[:n]) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
// write this vellum to disk
|
||||||
|
_, err = s.w.Write(vellumData) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
// reset vellum for reuse
|
||||||
|
s.builderBuf.Reset() |
||||||
|
|
||||||
|
err = s.builder.Reset(&s.builderBuf) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
// write the field doc values
|
||||||
|
if s.IncludeDocValues[fieldID] { |
||||||
|
for docNum, docTerms := range docTermMap { |
||||||
|
if len(docTerms) > 0 { |
||||||
|
err = fdvEncoder.Add(uint64(docNum), docTerms) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
err = fdvEncoder.Close() |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
fdvOffsetsStart[fieldID] = uint64(s.w.Count()) |
||||||
|
|
||||||
|
_, err = fdvEncoder.Write() |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
|
||||||
|
fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) |
||||||
|
|
||||||
|
fdvEncoder.Reset() |
||||||
|
} else { |
||||||
|
fdvOffsetsStart[fieldID] = fieldNotUninverted |
||||||
|
fdvOffsetsEnd[fieldID] = fieldNotUninverted |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
fdvIndexOffset = uint64(s.w.Count()) |
||||||
|
|
||||||
|
for i := 0; i < len(fdvOffsetsStart); i++ { |
||||||
|
n := binary.PutUvarint(buf, fdvOffsetsStart[i]) |
||||||
|
_, err := s.w.Write(buf[:n]) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) |
||||||
|
_, err = s.w.Write(buf[:n]) |
||||||
|
if err != nil { |
||||||
|
return 0, nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return fdvIndexOffset, dictOffsets, nil |
||||||
|
} |
||||||
|
|
||||||
|
func encodeFieldType(f document.Field) byte { |
||||||
|
fieldType := byte('x') |
||||||
|
switch f.(type) { |
||||||
|
case *document.TextField: |
||||||
|
fieldType = 't' |
||||||
|
case *document.NumericField: |
||||||
|
fieldType = 'n' |
||||||
|
case *document.DateTimeField: |
||||||
|
fieldType = 'd' |
||||||
|
case *document.BooleanField: |
||||||
|
fieldType = 'b' |
||||||
|
case *document.GeoPointField: |
||||||
|
fieldType = 'g' |
||||||
|
case *document.CompositeField: |
||||||
|
fieldType = 'c' |
||||||
|
} |
||||||
|
return fieldType |
||||||
|
} |
||||||
|
|
||||||
|
// returns the total # of bytes needed to encode the given uint64's
|
||||||
|
// into binary.PutUVarint() encoding
|
||||||
|
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { |
||||||
|
n = numUvarintBytes(a) |
||||||
|
n += numUvarintBytes(b) |
||||||
|
n += numUvarintBytes(c) |
||||||
|
n += numUvarintBytes(d) |
||||||
|
n += numUvarintBytes(e) |
||||||
|
for _, v := range more { |
||||||
|
n += numUvarintBytes(v) |
||||||
|
} |
||||||
|
return n |
||||||
|
} |
||||||
|
|
||||||
|
// returns # of bytes needed to encode x in binary.PutUvarint() encoding
|
||||||
|
func numUvarintBytes(x uint64) (n int) { |
||||||
|
for x >= 0x80 { |
||||||
|
x >>= 7 |
||||||
|
n++ |
||||||
|
} |
||||||
|
return n + 1 |
||||||
|
} |
File diff suppressed because it is too large
343 vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go generated vendored
@ -0,0 +1,343 @@
|||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package searcher |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"container/heap" |
||||||
|
"math" |
||||||
|
"reflect" |
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/index" |
||||||
|
"github.com/blevesearch/bleve/search" |
||||||
|
"github.com/blevesearch/bleve/search/scorer" |
||||||
|
"github.com/blevesearch/bleve/size" |
||||||
|
) |
||||||
|
|
||||||
|
var reflectStaticSizeDisjunctionHeapSearcher int |
||||||
|
var reflectStaticSizeSearcherCurr int |
||||||
|
|
||||||
|
func init() { |
||||||
|
var dhs DisjunctionHeapSearcher |
||||||
|
reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size()) |
||||||
|
|
||||||
|
var sc SearcherCurr |
||||||
|
reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size()) |
||||||
|
} |
||||||
|
|
||||||
|
type SearcherCurr struct { |
||||||
|
searcher search.Searcher |
||||||
|
curr *search.DocumentMatch |
||||||
|
} |
||||||
|
|
||||||
|
type DisjunctionHeapSearcher struct { |
||||||
|
indexReader index.IndexReader |
||||||
|
|
||||||
|
numSearchers int |
||||||
|
scorer *scorer.DisjunctionQueryScorer |
||||||
|
min int |
||||||
|
queryNorm float64 |
||||||
|
initialized bool |
||||||
|
searchers []search.Searcher |
||||||
|
heap []*SearcherCurr |
||||||
|
|
||||||
|
matching []*search.DocumentMatch |
||||||
|
matchingCurrs []*SearcherCurr |
||||||
|
} |
||||||
|
|
||||||
|
func newDisjunctionHeapSearcher(indexReader index.IndexReader, |
||||||
|
searchers []search.Searcher, min float64, options search.SearcherOptions, |
||||||
|
limit bool) ( |
||||||
|
*DisjunctionHeapSearcher, error) { |
||||||
|
if limit && tooManyClauses(len(searchers)) { |
||||||
|
return nil, tooManyClausesErr(len(searchers)) |
||||||
|
} |
||||||
|
|
||||||
|
// build our searcher
|
||||||
|
rv := DisjunctionHeapSearcher{ |
||||||
|
indexReader: indexReader, |
||||||
|
searchers: searchers, |
||||||
|
numSearchers: len(searchers), |
||||||
|
scorer: scorer.NewDisjunctionQueryScorer(options), |
||||||
|
min: int(min), |
||||||
|
matching: make([]*search.DocumentMatch, len(searchers)), |
||||||
|
matchingCurrs: make([]*SearcherCurr, len(searchers)), |
||||||
|
heap: make([]*SearcherCurr, 0, len(searchers)), |
||||||
|
} |
||||||
|
rv.computeQueryNorm() |
||||||
|
return &rv, nil |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Size() int { |
||||||
|
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + |
||||||
|
s.scorer.Size() |
||||||
|
|
||||||
|
for _, entry := range s.searchers { |
||||||
|
sizeInBytes += entry.Size() |
||||||
|
} |
||||||
|
|
||||||
|
for _, entry := range s.matching { |
||||||
|
if entry != nil { |
||||||
|
sizeInBytes += entry.Size() |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// for matchingCurrs and heap, just use static size * len
|
||||||
|
// since searchers and document matches already counted above
|
||||||
|
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr |
||||||
|
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr |
||||||
|
|
||||||
|
return sizeInBytes |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) computeQueryNorm() { |
||||||
|
// first calculate sum of squared weights
|
||||||
|
sumOfSquaredWeights := 0.0 |
||||||
|
for _, searcher := range s.searchers { |
||||||
|
sumOfSquaredWeights += searcher.Weight() |
||||||
|
} |
||||||
|
// now compute query norm from this
|
||||||
|
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) |
||||||
|
// finally tell all the downstream searchers the norm
|
||||||
|
for _, searcher := range s.searchers { |
||||||
|
searcher.SetQueryNorm(s.queryNorm) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error { |
||||||
|
// alloc a single block of SearcherCurrs
|
||||||
|
block := make([]SearcherCurr, len(s.searchers)) |
||||||
|
|
||||||
|
// get all searchers pointing at their first match
|
||||||
|
for i, searcher := range s.searchers { |
||||||
|
curr, err := searcher.Next(ctx) |
||||||
|
if err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if curr != nil { |
||||||
|
block[i].searcher = searcher |
||||||
|
block[i].curr = curr |
||||||
|
heap.Push(s, &block[i]) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
err := s.updateMatches() |
||||||
|
if err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
s.initialized = true |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) updateMatches() error { |
||||||
|
matching := s.matching[:0] |
||||||
|
matchingCurrs := s.matchingCurrs[:0] |
||||||
|
|
||||||
|
if len(s.heap) > 0 { |
||||||
|
|
||||||
|
// top of the heap is our next hit
|
||||||
|
next := heap.Pop(s).(*SearcherCurr) |
||||||
|
matching = append(matching, next.curr) |
||||||
|
matchingCurrs = append(matchingCurrs, next) |
||||||
|
|
||||||
|
// now as long as top of heap matches, keep popping
|
||||||
|
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { |
||||||
|
next = heap.Pop(s).(*SearcherCurr) |
||||||
|
matching = append(matching, next.curr) |
||||||
|
matchingCurrs = append(matchingCurrs, next) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
s.matching = matching |
||||||
|
s.matchingCurrs = matchingCurrs |
||||||
|
|
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Weight() float64 { |
||||||
|
var rv float64 |
||||||
|
for _, searcher := range s.searchers { |
||||||
|
rv += searcher.Weight() |
||||||
|
} |
||||||
|
return rv |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) { |
||||||
|
for _, searcher := range s.searchers { |
||||||
|
searcher.SetQueryNorm(qnorm) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( |
||||||
|
*search.DocumentMatch, error) { |
||||||
|
if !s.initialized { |
||||||
|
err := s.initSearchers(ctx) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
var rv *search.DocumentMatch |
||||||
|
found := false |
||||||
|
for !found && len(s.matching) > 0 { |
||||||
|
if len(s.matching) >= s.min { |
||||||
|
found = true |
||||||
|
// score this match
|
||||||
|
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) |
||||||
|
} |
||||||
|
|
||||||
|
// invoke next on all the matching searchers
|
||||||
|
for _, matchingCurr := range s.matchingCurrs { |
||||||
|
if matchingCurr.curr != rv { |
||||||
|
ctx.DocumentMatchPool.Put(matchingCurr.curr) |
||||||
|
} |
||||||
|
curr, err := matchingCurr.searcher.Next(ctx) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
if curr != nil { |
||||||
|
matchingCurr.curr = curr |
||||||
|
heap.Push(s, matchingCurr) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
err := s.updateMatches() |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return rv, nil |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, |
||||||
|
ID index.IndexInternalID) (*search.DocumentMatch, error) { |
||||||
|
if !s.initialized { |
||||||
|
err := s.initSearchers(ctx) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// if there is anything in matching, toss it back onto the heap
|
||||||
|
for _, matchingCurr := range s.matchingCurrs { |
||||||
|
heap.Push(s, matchingCurr) |
||||||
|
} |
||||||
|
s.matching = s.matching[:0] |
||||||
|
s.matchingCurrs = s.matchingCurrs[:0] |
||||||
|
|
||||||
|
// find all searchers that actually need to be advanced
|
||||||
|
// advance them, using s.matchingCurrs as temp storage
|
||||||
|
for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { |
||||||
|
searcherCurr := heap.Pop(s).(*SearcherCurr) |
||||||
|
ctx.DocumentMatchPool.Put(searcherCurr.curr) |
||||||
|
curr, err := searcherCurr.searcher.Advance(ctx, ID) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
if curr != nil { |
||||||
|
searcherCurr.curr = curr |
||||||
|
s.matchingCurrs = append(s.matchingCurrs, searcherCurr) |
||||||
|
} |
||||||
|
} |
||||||
|
// now all of the searchers that we advanced have to be pushed back
|
||||||
|
for _, matchingCurr := range s.matchingCurrs { |
||||||
|
heap.Push(s, matchingCurr) |
||||||
|
} |
||||||
|
// reset our temp space
|
||||||
|
s.matchingCurrs = s.matchingCurrs[:0] |
||||||
|
|
||||||
|
err := s.updateMatches() |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
|
||||||
|
return s.Next(ctx) |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Count() uint64 { |
||||||
|
// for now return a worst case
|
||||||
|
var sum uint64 |
||||||
|
for _, searcher := range s.searchers { |
||||||
|
sum += searcher.Count() |
||||||
|
} |
||||||
|
return sum |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Close() (rv error) { |
||||||
|
for _, searcher := range s.searchers { |
||||||
|
err := searcher.Close() |
||||||
|
if err != nil && rv == nil { |
||||||
|
rv = err |
||||||
|
} |
||||||
|
} |
||||||
|
return rv |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Min() int { |
||||||
|
return s.min |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { |
||||||
|
rv := len(s.searchers) |
||||||
|
for _, s := range s.searchers { |
||||||
|
rv += s.DocumentMatchPoolSize() |
||||||
|
} |
||||||
|
return rv |
||||||
|
} |
||||||
|
|
||||||
|
// a disjunction searcher implements the index.Optimizable interface
|
||||||
|
// but only activates on an edge case where the disjunction is a
|
||||||
|
// wrapper around a single Optimizable child searcher
|
||||||
|
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) ( |
||||||
|
index.OptimizableContext, error) { |
||||||
|
if len(s.searchers) == 1 { |
||||||
|
o, ok := s.searchers[0].(index.Optimizable) |
||||||
|
if ok { |
||||||
|
return o.Optimize(kind, octx) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return octx, nil |
||||||
|
} |
||||||
|
|
||||||
|
// heap impl
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) } |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Less(i, j int) bool { |
||||||
|
if s.heap[i].curr == nil { |
||||||
|
return true |
||||||
|
} else if s.heap[j].curr == nil { |
||||||
|
return false |
||||||
|
} |
||||||
|
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Swap(i, j int) { |
||||||
|
s.heap[i], s.heap[j] = s.heap[j], s.heap[i] |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Push(x interface{}) { |
||||||
|
s.heap = append(s.heap, x.(*SearcherCurr)) |
||||||
|
} |
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Pop() interface{} { |
||||||
|
old := s.heap |
||||||
|
n := len(old) |
||||||
|
x := old[n-1] |
||||||
|
s.heap = old[0 : n-1] |
||||||
|
return x |
||||||
|
} |
298 vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go generated vendored
@ -0,0 +1,298 @@
|||||||
|
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package searcher

import (
	"math"
	"reflect"
	"sort"

	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/search"
	"github.com/blevesearch/bleve/search/scorer"
	"github.com/blevesearch/bleve/size"
)

var reflectStaticSizeDisjunctionSliceSearcher int

func init() {
	var ds DisjunctionSliceSearcher
	reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size())
}

type DisjunctionSliceSearcher struct {
	indexReader  index.IndexReader
	searchers    OrderedSearcherList
	numSearchers int
	queryNorm    float64
	currs        []*search.DocumentMatch
	scorer       *scorer.DisjunctionQueryScorer
	min          int
	matching     []*search.DocumentMatch
	matchingIdxs []int
	initialized  bool
}

func newDisjunctionSliceSearcher(indexReader index.IndexReader,
	qsearchers []search.Searcher, min float64, options search.SearcherOptions,
	limit bool) (
	*DisjunctionSliceSearcher, error) {
	if limit && tooManyClauses(len(qsearchers)) {
		return nil, tooManyClausesErr(len(qsearchers))
	}
	// build the downstream searchers
	searchers := make(OrderedSearcherList, len(qsearchers))
	for i, searcher := range qsearchers {
		searchers[i] = searcher
	}
	// sort the searchers
	sort.Sort(sort.Reverse(searchers))
	// build our searcher
	rv := DisjunctionSliceSearcher{
		indexReader:  indexReader,
		searchers:    searchers,
		numSearchers: len(searchers),
		currs:        make([]*search.DocumentMatch, len(searchers)),
		scorer:       scorer.NewDisjunctionQueryScorer(options),
		min:          int(min),
		matching:     make([]*search.DocumentMatch, len(searchers)),
		matchingIdxs: make([]int, len(searchers)),
	}
	rv.computeQueryNorm()
	return &rv, nil
}

func (s *DisjunctionSliceSearcher) Size() int {
	sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr +
		s.scorer.Size()

	for _, entry := range s.searchers {
		sizeInBytes += entry.Size()
	}

	for _, entry := range s.currs {
		if entry != nil {
			sizeInBytes += entry.Size()
		}
	}

	for _, entry := range s.matching {
		if entry != nil {
			sizeInBytes += entry.Size()
		}
	}

	sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt

	return sizeInBytes
}

func (s *DisjunctionSliceSearcher) computeQueryNorm() {
	// first calculate sum of squared weights
	sumOfSquaredWeights := 0.0
	for _, searcher := range s.searchers {
		sumOfSquaredWeights += searcher.Weight()
	}
	// now compute query norm from this
	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
	// finally tell all the downstream searchers the norm
	for _, searcher := range s.searchers {
		searcher.SetQueryNorm(s.queryNorm)
	}
}
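A quick worked example of the normalization above, with illustrative numbers only: if the disjunction wraps three children whose Weight() calls return 1.0, 4.0 and 4.0, the accumulated sum is 9.0, so queryNorm = 1 / sqrt(9.0) ≈ 0.333, and that same factor is then pushed to every child through SetQueryNorm. The loop adds Weight() without squaring, presumably because each child's Weight() is expected to already report its squared contribution, which is how the comment and the code line up.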
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
	var err error
	// get all searchers pointing at their first match
	for i, searcher := range s.searchers {
		if s.currs[i] != nil {
			ctx.DocumentMatchPool.Put(s.currs[i])
		}
		s.currs[i], err = searcher.Next(ctx)
		if err != nil {
			return err
		}
	}

	err = s.updateMatches()
	if err != nil {
		return err
	}

	s.initialized = true
	return nil
}

func (s *DisjunctionSliceSearcher) updateMatches() error {
	matching := s.matching[:0]
	matchingIdxs := s.matchingIdxs[:0]

	for i := 0; i < len(s.currs); i++ {
		curr := s.currs[i]
		if curr == nil {
			continue
		}

		if len(matching) > 0 {
			cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
			if cmp > 0 {
				continue
			}

			if cmp < 0 {
				matching = matching[:0]
				matchingIdxs = matchingIdxs[:0]
			}
		}

		matching = append(matching, curr)
		matchingIdxs = append(matchingIdxs, i)
	}

	s.matching = matching
	s.matchingIdxs = matchingIdxs

	return nil
}

func (s *DisjunctionSliceSearcher) Weight() float64 {
	var rv float64
	for _, searcher := range s.searchers {
		rv += searcher.Weight()
	}
	return rv
}

func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) {
	for _, searcher := range s.searchers {
		searcher.SetQueryNorm(qnorm)
	}
}

func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
	*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	var err error
	var rv *search.DocumentMatch

	found := false
	for !found && len(s.matching) > 0 {
		if len(s.matching) >= s.min {
			found = true
			// score this match
			rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
		}

		// invoke next on all the matching searchers
		for _, i := range s.matchingIdxs {
			searcher := s.searchers[i]
			if s.currs[i] != rv {
				ctx.DocumentMatchPool.Put(s.currs[i])
			}
			s.currs[i], err = searcher.Next(ctx)
			if err != nil {
				return nil, err
			}
		}

		err = s.updateMatches()
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}

func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
	ID index.IndexInternalID) (*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	// get all searchers pointing at their first match
	var err error
	for i, searcher := range s.searchers {
		if s.currs[i] != nil {
			if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
				continue
			}
			ctx.DocumentMatchPool.Put(s.currs[i])
		}
		s.currs[i], err = searcher.Advance(ctx, ID)
		if err != nil {
			return nil, err
		}
	}

	err = s.updateMatches()
	if err != nil {
		return nil, err
	}

	return s.Next(ctx)
}

func (s *DisjunctionSliceSearcher) Count() uint64 {
	// for now return a worst case
	var sum uint64
	for _, searcher := range s.searchers {
		sum += searcher.Count()
	}
	return sum
}

func (s *DisjunctionSliceSearcher) Close() (rv error) {
	for _, searcher := range s.searchers {
		err := searcher.Close()
		if err != nil && rv == nil {
			rv = err
		}
	}
	return rv
}

func (s *DisjunctionSliceSearcher) Min() int {
	return s.min
}

func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int {
	rv := len(s.currs)
	for _, s := range s.searchers {
		rv += s.DocumentMatchPoolSize()
	}
	return rv
}

// a disjunction searcher implements the index.Optimizable interface
// but only activates on an edge case where the disjunction is a
// wrapper around a single Optimizable child searcher
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) (
	index.OptimizableContext, error) {
	if len(s.searchers) == 1 {
		o, ok := s.searchers[0].(index.Optimizable)
		if ok {
			return o.Optimize(kind, octx)
		}
	}

	return octx, nil
}
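Both disjunction variants end with the same Optimize hook: the disjunction forwards the optimization request only when it wraps exactly one child that itself implements index.Optimizable; in every other case the context is returned untouched. The following is a small standalone sketch of that unwrap-and-delegate pattern; the types, method shapes, and the "example-kind" string are illustrative stand-ins, not bleve's.

// optimizesketch.go -- illustrative only; these types stand in for the searchers above.
package main

import "fmt"

// optimizable mirrors the idea of index.Optimizable for this sketch.
type optimizable interface {
	Optimize(kind string, ctx string) (string, error)
}

// leaf is a child that can negotiate an optimization.
type leaf struct{}

func (leaf) Optimize(kind string, ctx string) (string, error) {
	return ctx + "+" + kind, nil
}

// opaque is a child that cannot.
type opaque struct{}

// disjunction wraps an arbitrary set of children, like the searchers above.
type disjunction struct {
	children []interface{}
}

// Optimize delegates only when there is exactly one child and that child
// implements the interface; otherwise the context passes through unchanged.
func (d disjunction) Optimize(kind string, ctx string) (string, error) {
	if len(d.children) == 1 {
		if o, ok := d.children[0].(optimizable); ok {
			return o.Optimize(kind, ctx)
		}
	}
	return ctx, nil
}

func main() {
	single := disjunction{children: []interface{}{leaf{}}}
	multi := disjunction{children: []interface{}{leaf{}, opaque{}}}
	fmt.Println(single.Optimize("example-kind", "ctx")) // delegated to the child
	fmt.Println(multi.Optimize("example-kind", "ctx"))  // returned untouched
}

The guard exists because a one-clause disjunction adds no semantics of its own, so it can safely let its only child negotiate the optimization directly.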