diff --git a/Gopkg.lock b/Gopkg.lock index 3419a83fd..0fe028a5e 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -40,14 +40,6 @@ revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0" version = "v0.4.7" -[[projects]] - branch = "master" - digest = "1:93367b6d47a8ccc7d14f9f493ccf103ccf5afb698559ff8e8f1999427ce27ace" - name = "github.com/Smerity/govarint" - packages = ["."] - pruneopts = "NUT" - revision = "7265e41f48f15fd61751e16da866af3c704bb3ab" - [[projects]] branch = "master" digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146" @@ -98,7 +90,8 @@ revision = "3a771d992973f24aa725d07868b467d1ddfceafb" [[projects]] - digest = "1:c10f35be6200b09e26da267ca80f837315093ecaba27e7a223071380efb9dd32" + branch = "master" + digest = "1:b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2" name = "github.com/blevesearch/bleve" packages = [ ".", @@ -121,7 +114,6 @@ "index/scorch", "index/scorch/mergeplan", "index/scorch/segment", - "index/scorch/segment/mem", "index/scorch/segment/zap", "index/store", "index/store/boltdb", @@ -141,9 +133,10 @@ "search/query", "search/scorer", "search/searcher", + "size", ] pruneopts = "NUT" - revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" + revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5" [[projects]] branch = "master" @@ -160,14 +153,6 @@ pruneopts = "NUT" revision = "db70c57796cc8c310613541dfade3dce627d09c7" -[[projects]] - digest = "1:c7e0968c05659f3973148cd5c5387d6ee960a6ae1b2eaaec0b1d435d806458bb" - name = "github.com/boltdb/bolt" - packages = ["."] - pruneopts = "NUT" - revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7" - source = "github.com/go-gitea/bolt" - [[projects]] digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5" name = "github.com/boombuler/barcode" @@ -217,15 +202,16 @@ [[projects]] branch = "master" - digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6" + digest = 
"1:6a658ac7d23204dc743c7155557c45273747d78e05ae0579742bd6b744bce215" name = "github.com/couchbase/vellum" packages = [ ".", + "levenshtein2", "regexp", "utf8", ] pruneopts = "NUT" - revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4" + revision = "e91b68ff3efe3cc11723aa25dd315cbc9276cd65" [[projects]] branch = "master" @@ -287,6 +273,14 @@ revision = "1615341f118ae12f353cc8a983f35b584342c9b3" version = "v1.12.0" +[[projects]] + digest = "1:ae8eea1a24ae43a46c2e96631b6303fcc4210ca0ac9d643e4da965029d1b511d" + name = "github.com/etcd-io/bbolt" + packages = ["."] + pruneopts = "NUT" + revision = "63597a96ec0ad9e6d43c3fc81e809909e0237461" + version = "v1.3.2" + [[projects]] digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48" name = "github.com/ethantkoenig/rupture" diff --git a/Gopkg.toml b/Gopkg.toml index 51f2b2cab..94f15079b 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -15,10 +15,8 @@ ignored = ["google.golang.org/appengine*"] name = "code.gitea.io/sdk" [[constraint]] -# branch = "master" - revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" + revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5" name = "github.com/blevesearch/bleve" -#Not targetting v0.7.0 since standard where use only just after this tag [[constraint]] revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e" @@ -108,11 +106,6 @@ ignored = ["google.golang.org/appengine*"] name = "gopkg.in/testfixtures.v2" version = "2.0.0" -[[override]] - name = "github.com/boltdb/bolt" - revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7" - source = "github.com/go-gitea/bolt" - [[override]] branch = "master" name = "golang.org/x/oauth2" diff --git a/vendor/github.com/Smerity/govarint/LICENSE b/vendor/github.com/Smerity/govarint/LICENSE deleted file mode 100644 index be09cac86..000000000 --- a/vendor/github.com/Smerity/govarint/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Stephen Merity - -Permission is hereby granted, free of charge, to any person 
obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/vendor/github.com/Smerity/govarint/govarint.go b/vendor/github.com/Smerity/govarint/govarint.go deleted file mode 100644 index 61328a337..000000000 --- a/vendor/github.com/Smerity/govarint/govarint.go +++ /dev/null @@ -1,229 +0,0 @@ -package govarint - -import "encoding/binary" -import "io" - -type U32VarintEncoder interface { - PutU32(x uint32) int - Close() -} - -type U32VarintDecoder interface { - GetU32() (uint32, error) -} - -/// - -type U64VarintEncoder interface { - PutU64(x uint64) int - Close() -} - -type U64VarintDecoder interface { - GetU64() (uint64, error) -} - -/// - -type U32GroupVarintEncoder struct { - w io.Writer - index int - store [4]uint32 - temp [17]byte -} - -func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} } - -func (b *U32GroupVarintEncoder) Flush() (int, error) { - // TODO: Is it more efficient to have a tailored version that's called only in Close()? 
- // If index is zero, there are no integers to flush - if b.index == 0 { - return 0, nil - } - // In the case we're flushing (the group isn't of size four), the non-values should be zero - // This ensures the unused entries are all zero in the sizeByte - for i := b.index; i < 4; i++ { - b.store[i] = 0 - } - length := 1 - // We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it - b.temp[0] = 0 - for i, x := range b.store { - size := byte(0) - shifts := []byte{24, 16, 8, 0} - for _, shift := range shifts { - // Always writes at least one byte -- the first one (shift = 0) - // Will write more bytes until the rest of the integer is all zeroes - if (x>>shift) != 0 || shift == 0 { - size += 1 - b.temp[length] = byte(x >> shift) - length += 1 - } - } - // We store the size in two of the eight bits in the first byte (sizeByte) - // 0 means there is one byte in total, hence why we subtract one from size - b.temp[0] |= (size - 1) << (uint8(3-i) * 2) - } - // If we're flushing without a full group of four, remove the unused bytes we computed - // This enables us to realize it's a partial group on decoding thanks to EOF - if b.index != 4 { - length -= 4 - b.index - } - _, err := b.w.Write(b.temp[:length]) - return length, err -} - -func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) { - bytesWritten := 0 - b.store[b.index] = x - b.index += 1 - if b.index == 4 { - n, err := b.Flush() - if err != nil { - return n, err - } - bytesWritten += n - b.index = 0 - } - return bytesWritten, nil -} - -func (b *U32GroupVarintEncoder) Close() { - // On Close, we flush any remaining values that might not have been in a full group - b.Flush() -} - -/// - -type U32GroupVarintDecoder struct { - r io.ByteReader - group [4]uint32 - pos int - finished bool - capacity int -} - -func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder { - return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4} -} - -func (b *U32GroupVarintDecoder) 
getGroup() error { - // We should always receive a sizeByte if there are more values to read - sizeByte, err := b.r.ReadByte() - if err != nil { - return err - } - // Calculate the size of the four incoming 32 bit integers - // 0b00 means 1 byte to read, 0b01 = 2, etc - b.group[0] = uint32((sizeByte >> 6) & 3) - b.group[1] = uint32((sizeByte >> 4) & 3) - b.group[2] = uint32((sizeByte >> 2) & 3) - b.group[3] = uint32(sizeByte & 3) - // - for index, size := range b.group { - b.group[index] = 0 - // Any error that occurs in earlier byte reads should be repeated at the end one - // Hence we only catch and report the final ReadByte's error - var err error - switch size { - case 0: - var x byte - x, err = b.r.ReadByte() - b.group[index] = uint32(x) - case 1: - var x, y byte - x, _ = b.r.ReadByte() - y, err = b.r.ReadByte() - b.group[index] = uint32(x)<<8 | uint32(y) - case 2: - var x, y, z byte - x, _ = b.r.ReadByte() - y, _ = b.r.ReadByte() - z, err = b.r.ReadByte() - b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z) - case 3: - var x, y, z, zz byte - x, _ = b.r.ReadByte() - y, _ = b.r.ReadByte() - z, _ = b.r.ReadByte() - zz, err = b.r.ReadByte() - b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz) - } - if err != nil { - if err == io.EOF { - // If we hit EOF here, we have found a partial group - // We've return any valid entries we have read and return EOF once we run out - b.capacity = index - b.finished = true - break - } else { - return err - } - } - } - // Reset the pos pointer to the beginning of the read values - b.pos = 0 - return nil -} - -func (b *U32GroupVarintDecoder) GetU32() (uint32, error) { - // Check if we have any more values to give out - if not, let's get them - if b.pos == b.capacity { - // If finished is set, there is nothing else to do - if b.finished { - return 0, io.EOF - } - err := b.getGroup() - if err != nil { - return 0, err - } - } - // Increment pointer and return the value stored at that point - b.pos += 1 
- return b.group[b.pos-1], nil -} - -/// - -type Base128Encoder struct { - w io.Writer - tmpBytes []byte -} - -func NewU32Base128Encoder(w io.Writer) *Base128Encoder { - return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)} -} -func NewU64Base128Encoder(w io.Writer) *Base128Encoder { - return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)} -} - -func (b *Base128Encoder) PutU32(x uint32) (int, error) { - writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x)) - return b.w.Write(b.tmpBytes[:writtenBytes]) -} - -func (b *Base128Encoder) PutU64(x uint64) (int, error) { - writtenBytes := binary.PutUvarint(b.tmpBytes, x) - return b.w.Write(b.tmpBytes[:writtenBytes]) -} - -func (b *Base128Encoder) Close() { -} - -/// - -type Base128Decoder struct { - r io.ByteReader -} - -func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } -func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } - -func (b *Base128Decoder) GetU32() (uint32, error) { - v, err := binary.ReadUvarint(b.r) - return uint32(v), err -} - -func (b *Base128Decoder) GetU64() (uint64, error) { - return binary.ReadUvarint(b.r) -} diff --git a/vendor/github.com/blevesearch/bleve/analysis/freq.go b/vendor/github.com/blevesearch/bleve/analysis/freq.go index e1ca2cd6f..198c149b2 100644 --- a/vendor/github.com/blevesearch/bleve/analysis/freq.go +++ b/vendor/github.com/blevesearch/bleve/analysis/freq.go @@ -14,6 +14,22 @@ package analysis +import ( + "reflect" + + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeTokenLocation int +var reflectStaticSizeTokenFreq int + +func init() { + var tl TokenLocation + reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size()) + var tf TokenFreq + reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size()) +} + // TokenLocation represents one occurrence of a term at a particular location in // a field. 
Start, End and Position have the same meaning as in analysis.Token. // Field and ArrayPositions identify the field value in the source document. @@ -26,6 +42,12 @@ type TokenLocation struct { Position int } +func (tl *TokenLocation) Size() int { + rv := reflectStaticSizeTokenLocation + rv += len(tl.ArrayPositions) * size.SizeOfUint64 + return rv +} + // TokenFreq represents all the occurrences of a term in all fields of a // document. type TokenFreq struct { @@ -34,6 +56,15 @@ type TokenFreq struct { frequency int } +func (tf *TokenFreq) Size() int { + rv := reflectStaticSizeTokenFreq + rv += len(tf.Term) + for _, loc := range tf.Locations { + rv += loc.Size() + } + return rv +} + func (tf *TokenFreq) Frequency() int { return tf.frequency } @@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int { // fields. type TokenFrequencies map[string]*TokenFreq +func (tfs TokenFrequencies) Size() int { + rv := size.SizeOfMap + rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr) + for k, v := range tfs { + rv += len(k) + rv += v.Size() + } + return rv +} + func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { // walk the new token frequencies for tfk, tf := range other { diff --git a/vendor/github.com/blevesearch/bleve/analysis/token/camelcase/parser.go b/vendor/github.com/blevesearch/bleve/analysis/token/camelcase/parser.go index d691e5646..ff4ce2fea 100644 --- a/vendor/github.com/blevesearch/bleve/analysis/token/camelcase/parser.go +++ b/vendor/github.com/blevesearch/bleve/analysis/token/camelcase/parser.go @@ -46,11 +46,11 @@ type Parser struct { index int } -func NewParser(len, position, index int) *Parser { +func NewParser(length, position, index int) *Parser { return &Parser{ - bufferLen: len, - buffer: make([]rune, 0, len), - tokens: make([]*analysis.Token, 0, len), + bufferLen: length, + buffer: make([]rune, 0, length), + tokens: make([]*analysis.Token, 0, length), position: position, index: index, } diff --git 
a/vendor/github.com/blevesearch/bleve/analysis/token/unique/unique.go b/vendor/github.com/blevesearch/bleve/analysis/token/unique/unique.go index f0d96c504..c60e8c979 100644 --- a/vendor/github.com/blevesearch/bleve/analysis/token/unique/unique.go +++ b/vendor/github.com/blevesearch/bleve/analysis/token/unique/unique.go @@ -21,7 +21,7 @@ import ( const Name = "unique" -// UniqueTermFilter retains only the tokens which mark the first occurence of +// UniqueTermFilter retains only the tokens which mark the first occurrence of // a term. Tokens whose term appears in a preceding token are dropped. type UniqueTermFilter struct{} diff --git a/vendor/github.com/blevesearch/bleve/document/document.go b/vendor/github.com/blevesearch/bleve/document/document.go index c37585c66..6ac17b9ab 100644 --- a/vendor/github.com/blevesearch/bleve/document/document.go +++ b/vendor/github.com/blevesearch/bleve/document/document.go @@ -14,7 +14,19 @@ package document -import "fmt" +import ( + "fmt" + "reflect" + + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDocument int + +func init() { + var d Document + reflectStaticSizeDocument = int(reflect.TypeOf(d).Size()) +} type Document struct { ID string `json:"id"` @@ -30,6 +42,21 @@ func NewDocument(id string) *Document { } } +func (d *Document) Size() int { + sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr + + len(d.ID) + + for _, entry := range d.Fields { + sizeInBytes += entry.Size() + } + + for _, entry := range d.CompositeFields { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + func (d *Document) AddField(f Field) *Document { switch f := f.(type) { case *CompositeField: diff --git a/vendor/github.com/blevesearch/bleve/document/field.go b/vendor/github.com/blevesearch/bleve/document/field.go index c17f81e5d..2fe916698 100644 --- a/vendor/github.com/blevesearch/bleve/document/field.go +++ b/vendor/github.com/blevesearch/bleve/document/field.go @@ -36,4 +36,6 @@ type Field interface { // that this 
field represents - this is a common metric for tracking // the rate of indexing NumPlainTextBytes() uint64 + + Size() int } diff --git a/vendor/github.com/blevesearch/bleve/document/field_boolean.go b/vendor/github.com/blevesearch/bleve/document/field_boolean.go index c226374c0..6864b16f4 100644 --- a/vendor/github.com/blevesearch/bleve/document/field_boolean.go +++ b/vendor/github.com/blevesearch/bleve/document/field_boolean.go @@ -16,10 +16,19 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeBooleanField int + +func init() { + var f BooleanField + reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size()) +} + const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues type BooleanField struct { @@ -30,6 +39,13 @@ type BooleanField struct { numPlainTextBytes uint64 } +func (b *BooleanField) Size() int { + return reflectStaticSizeBooleanField + size.SizeOfPtr + + len(b.name) + + len(b.arrayPositions)*size.SizeOfUint64 + + len(b.value) +} + func (b *BooleanField) Name() string { return b.name } diff --git a/vendor/github.com/blevesearch/bleve/document/field_composite.go b/vendor/github.com/blevesearch/bleve/document/field_composite.go index b41b1b8ed..a8285880f 100644 --- a/vendor/github.com/blevesearch/bleve/document/field_composite.go +++ b/vendor/github.com/blevesearch/bleve/document/field_composite.go @@ -15,9 +15,19 @@ package document import ( + "reflect" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeCompositeField int + +func init() { + var cf CompositeField + reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size()) +} + const DefaultCompositeIndexingOptions = IndexField type CompositeField struct { @@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl return rv } +func (c *CompositeField) Size() int { + sizeInBytes := 
reflectStaticSizeCompositeField + size.SizeOfPtr + + len(c.name) + + for k, _ := range c.includedFields { + sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool + } + + for k, _ := range c.excludedFields { + sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool + } + + return sizeInBytes +} + func (c *CompositeField) Name() string { return c.name } diff --git a/vendor/github.com/blevesearch/bleve/document/field_datetime.go b/vendor/github.com/blevesearch/bleve/document/field_datetime.go index 1db068c87..583b44cde 100644 --- a/vendor/github.com/blevesearch/bleve/document/field_datetime.go +++ b/vendor/github.com/blevesearch/bleve/document/field_datetime.go @@ -17,12 +17,21 @@ package document import ( "fmt" "math" + "reflect" "time" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/numeric" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDateTimeField int + +func init() { + var f DateTimeField + reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size()) +} + const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues const DefaultDateTimePrecisionStep uint = 4 @@ -37,6 +46,12 @@ type DateTimeField struct { numPlainTextBytes uint64 } +func (n *DateTimeField) Size() int { + return reflectStaticSizeDateTimeField + size.SizeOfPtr + + len(n.name) + + len(n.arrayPositions)*size.SizeOfUint64 +} + func (n *DateTimeField) Name() string { return n.name } diff --git a/vendor/github.com/blevesearch/bleve/document/field_geopoint.go b/vendor/github.com/blevesearch/bleve/document/field_geopoint.go index f508b3625..91fe23f96 100644 --- a/vendor/github.com/blevesearch/bleve/document/field_geopoint.go +++ b/vendor/github.com/blevesearch/bleve/document/field_geopoint.go @@ -16,12 +16,21 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/geo" "github.com/blevesearch/bleve/numeric" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeGeoPointField 
int + +func init() { + var f GeoPointField + reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size()) +} + var GeoPrecisionStep uint = 9 type GeoPointField struct { @@ -32,6 +41,12 @@ type GeoPointField struct { numPlainTextBytes uint64 } +func (n *GeoPointField) Size() int { + return reflectStaticSizeGeoPointField + size.SizeOfPtr + + len(n.name) + + len(n.arrayPositions)*size.SizeOfUint64 +} + func (n *GeoPointField) Name() string { return n.name } diff --git a/vendor/github.com/blevesearch/bleve/document/field_numeric.go b/vendor/github.com/blevesearch/bleve/document/field_numeric.go index e32993c88..46c685e84 100644 --- a/vendor/github.com/blevesearch/bleve/document/field_numeric.go +++ b/vendor/github.com/blevesearch/bleve/document/field_numeric.go @@ -16,11 +16,20 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/numeric" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeNumericField int + +func init() { + var f NumericField + reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size()) +} + const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues const DefaultPrecisionStep uint = 4 @@ -33,6 +42,12 @@ type NumericField struct { numPlainTextBytes uint64 } +func (n *NumericField) Size() int { + return reflectStaticSizeNumericField + size.SizeOfPtr + + len(n.name) + + len(n.arrayPositions)*size.SizeOfPtr +} + func (n *NumericField) Name() string { return n.name } diff --git a/vendor/github.com/blevesearch/bleve/document/field_text.go b/vendor/github.com/blevesearch/bleve/document/field_text.go index 5f7a3ab64..c8e871c9d 100644 --- a/vendor/github.com/blevesearch/bleve/document/field_text.go +++ b/vendor/github.com/blevesearch/bleve/document/field_text.go @@ -16,10 +16,19 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTextField int + +func init() { + var f 
TextField + reflectStaticSizeTextField = int(reflect.TypeOf(f).Size()) +} + const DefaultTextIndexingOptions = IndexField | DocValues type TextField struct { @@ -31,6 +40,13 @@ type TextField struct { numPlainTextBytes uint64 } +func (t *TextField) Size() int { + return reflectStaticSizeTextField + size.SizeOfPtr + + len(t.name) + + len(t.arrayPositions)*size.SizeOfUint64 + + len(t.value) +} + func (t *TextField) Name() string { return t.name } diff --git a/vendor/github.com/blevesearch/bleve/geo/geohash.go b/vendor/github.com/blevesearch/bleve/geo/geohash.go new file mode 100644 index 000000000..35db720c0 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/geo/geohash.go @@ -0,0 +1,174 @@ +// The code here was obtained from: +// https://github.com/mmcloughlin/geohash + +// The MIT License (MIT) +// Copyright (c) 2015 Michael McLoughlin +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +package geo + +import ( + "math" +) + +// encoding encapsulates an encoding defined by a given base32 alphabet. +type encoding struct { + enc string + dec [256]byte +} + +// newEncoding constructs a new encoding defined by the given alphabet, +// which must be a 32-byte string. +func newEncoding(encoder string) *encoding { + e := new(encoding) + e.enc = encoder + for i := 0; i < len(e.dec); i++ { + e.dec[i] = 0xff + } + for i := 0; i < len(encoder); i++ { + e.dec[encoder[i]] = byte(i) + } + return e +} + +// Decode string into bits of a 64-bit word. The string s may be at most 12 +// characters. +func (e *encoding) decode(s string) uint64 { + x := uint64(0) + for i := 0; i < len(s); i++ { + x = (x << 5) | uint64(e.dec[s[i]]) + } + return x +} + +// Encode bits of 64-bit word into a string. +func (e *encoding) encode(x uint64) string { + b := [12]byte{} + for i := 0; i < 12; i++ { + b[11-i] = e.enc[x&0x1f] + x >>= 5 + } + return string(b[:]) +} + +// Base32Encoding with the Geohash alphabet. +var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz") + +// BoundingBox returns the region encoded by the given string geohash. +func geoBoundingBox(hash string) geoBox { + bits := uint(5 * len(hash)) + inthash := base32encoding.decode(hash) + return geoBoundingBoxIntWithPrecision(inthash, bits) +} + +// Box represents a rectangle in latitude/longitude space. +type geoBox struct { + minLat float64 + maxLat float64 + minLng float64 + maxLng float64 +} + +// Round returns a point inside the box, making an effort to round to minimal +// precision. +func (b geoBox) round() (lat, lng float64) { + x := maxDecimalPower(b.maxLat - b.minLat) + lat = math.Ceil(b.minLat/x) * x + x = maxDecimalPower(b.maxLng - b.minLng) + lng = math.Ceil(b.minLng/x) * x + return +} + +// precalculated for performance +var exp232 = math.Exp2(32) + +// errorWithPrecision returns the error range in latitude and longitude for in +// integer geohash with bits of precision. 
+func errorWithPrecision(bits uint) (latErr, lngErr float64) { + b := int(bits) + latBits := b / 2 + lngBits := b - latBits + latErr = math.Ldexp(180.0, -latBits) + lngErr = math.Ldexp(360.0, -lngBits) + return +} + +// minDecimalPlaces returns the minimum number of decimal places such that +// there must exist an number with that many places within any range of width +// r. This is intended for returning minimal precision coordinates inside a +// box. +func maxDecimalPower(r float64) float64 { + m := int(math.Floor(math.Log10(r))) + return math.Pow10(m) +} + +// Encode the position of x within the range -r to +r as a 32-bit integer. +func encodeRange(x, r float64) uint32 { + p := (x + r) / (2 * r) + return uint32(p * exp232) +} + +// Decode the 32-bit range encoding X back to a value in the range -r to +r. +func decodeRange(X uint32, r float64) float64 { + p := float64(X) / exp232 + x := 2*r*p - r + return x +} + +// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are +// ignored, and may take any value. +func squash(X uint64) uint32 { + X &= 0x5555555555555555 + X = (X | (X >> 1)) & 0x3333333333333333 + X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f + X = (X | (X >> 4)) & 0x00ff00ff00ff00ff + X = (X | (X >> 8)) & 0x0000ffff0000ffff + X = (X | (X >> 16)) & 0x00000000ffffffff + return uint32(X) +} + +// Deinterleave the bits of X into 32-bit words containing the even and odd +// bitlevels of X, respectively. +func deinterleave(X uint64) (uint32, uint32) { + return squash(X), squash(X >> 1) +} + +// BoundingBoxIntWithPrecision returns the region encoded by the integer +// geohash with the specified precision. 
+func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox { + fullHash := hash << (64 - bits) + latInt, lngInt := deinterleave(fullHash) + lat := decodeRange(latInt, 90) + lng := decodeRange(lngInt, 180) + latErr, lngErr := errorWithPrecision(bits) + return geoBox{ + minLat: lat, + maxLat: lat + latErr, + minLng: lng, + maxLng: lng + lngErr, + } +} + +// ---------------------------------------------------------------------- + +// Decode the string geohash to a (lat, lng) point. +func GeoHashDecode(hash string) (lat, lng float64) { + box := geoBoundingBox(hash) + return box.round() +} diff --git a/vendor/github.com/blevesearch/bleve/geo/parse.go b/vendor/github.com/blevesearch/bleve/geo/parse.go index 04a57538d..0511fea7b 100644 --- a/vendor/github.com/blevesearch/bleve/geo/parse.go +++ b/vendor/github.com/blevesearch/bleve/geo/parse.go @@ -16,6 +16,7 @@ package geo import ( "reflect" + "strconv" "strings" ) @@ -24,6 +25,8 @@ import ( // Container: // slice length 2 (GeoJSON) // first element lon, second element lat +// string (coordinates separated by comma, or a geohash) +// first element lat, second element lon // map[string]interface{} // exact keys lat and lon or lng // struct @@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { var foundLon, foundLat bool thingVal := reflect.ValueOf(thing) + if !thingVal.IsValid() { + return lon, lat, false + } + thingTyp := thingVal.Type() // is it a slice - if thingVal.IsValid() && thingVal.Kind() == reflect.Slice { + if thingVal.Kind() == reflect.Slice { // must be length 2 if thingVal.Len() == 2 { first := thingVal.Index(0) @@ -55,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { } } + // is it a string + if thingVal.Kind() == reflect.String { + geoStr := thingVal.Interface().(string) + if strings.Contains(geoStr, ",") { + // geo point with coordinates split by comma + points := strings.Split(geoStr, ",") + for i, point := range points 
{ + // trim any leading or trailing white spaces + points[i] = strings.TrimSpace(point) + } + if len(points) == 2 { + var err error + lat, err = strconv.ParseFloat(points[0], 64) + if err == nil { + foundLat = true + } + lon, err = strconv.ParseFloat(points[1], 64) + if err == nil { + foundLon = true + } + } + } else { + // geohash + lat, lon = GeoHashDecode(geoStr) + foundLat = true + foundLon = true + } + } + // is it a map if l, ok := thing.(map[string]interface{}); ok { if lval, ok := l["lon"]; ok { @@ -68,7 +104,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { } // now try reflection on struct fields - if thingVal.IsValid() && thingVal.Kind() == reflect.Struct { + if thingVal.Kind() == reflect.Struct { for i := 0; i < thingVal.NumField(); i++ { fieldName := thingTyp.Field(i).Name if strings.HasPrefix(strings.ToLower(fieldName), "lon") { @@ -113,6 +149,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { // extract numeric value (if possible) and returns a float64 func extractNumericVal(v interface{}) (float64, bool) { val := reflect.ValueOf(v) + if !val.IsValid() { + return 0, false + } typ := val.Type() switch typ.Kind() { case reflect.Float32, reflect.Float64: diff --git a/vendor/github.com/blevesearch/bleve/index.go b/vendor/github.com/blevesearch/bleve/index.go index ea7b3832a..99357eee0 100644 --- a/vendor/github.com/blevesearch/bleve/index.go +++ b/vendor/github.com/blevesearch/bleve/index.go @@ -21,6 +21,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/mapping" + "github.com/blevesearch/bleve/size" ) // A Batch groups together multiple Index and Delete @@ -32,6 +33,9 @@ import ( type Batch struct { index Index internal *index.Batch + + lastDocSize uint64 + totalSize uint64 } // Index adds the specified index operation to the @@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error { return err } 
b.internal.Update(doc) + + b.lastDocSize = uint64(doc.Size() + + len(id) + size.SizeOfString) // overhead from internal + b.totalSize += b.lastDocSize + return nil } +func (b *Batch) LastDocSize() uint64 { + return b.lastDocSize +} + +func (b *Batch) TotalDocsSize() uint64 { + return b.totalSize +} + // IndexAdvanced adds the specified index operation to the // batch which skips the mapping. NOTE: the bleve Index is not updated // until the batch is executed. @@ -102,6 +119,24 @@ func (b *Batch) Reset() { b.internal.Reset() } +func (b *Batch) Merge(o *Batch) { + if o != nil && o.internal != nil { + b.internal.Merge(o.internal) + if o.LastDocSize() > 0 { + b.lastDocSize = o.LastDocSize() + } + b.totalSize = uint64(b.internal.TotalDocSize()) + } +} + +func (b *Batch) SetPersistedCallback(f index.BatchCallback) { + b.internal.SetPersistedCallback(f) +} + +func (b *Batch) PersistedCallback() index.BatchCallback { + return b.internal.PersistedCallback() +} + // An Index implements all the indexing and searching // capabilities of bleve. An Index can be created // using the New() and Open() methods. 
diff --git a/vendor/github.com/blevesearch/bleve/index/analysis.go b/vendor/github.com/blevesearch/bleve/index/analysis.go index 840dad97a..82883af01 100644 --- a/vendor/github.com/blevesearch/bleve/index/analysis.go +++ b/vendor/github.com/blevesearch/bleve/index/analysis.go @@ -15,10 +15,20 @@ package index import ( + "reflect" + "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeAnalysisResult int + +func init() { + var ar AnalysisResult + reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size()) +} + type IndexRow interface { KeySize() int KeyTo([]byte) (int, error) @@ -39,6 +49,15 @@ type AnalysisResult struct { Length []int } +func (a *AnalysisResult) Size() int { + rv := reflectStaticSizeAnalysisResult + for _, analyzedI := range a.Analyzed { + rv += analyzedI.Size() + } + rv += len(a.Length) * size.SizeOfInt + return rv +} + type AnalysisWork struct { i Index d *document.Document diff --git a/vendor/github.com/blevesearch/bleve/index/index.go b/vendor/github.com/blevesearch/bleve/index/index.go index 9870b4172..6aa444cfd 100644 --- a/vendor/github.com/blevesearch/bleve/index/index.go +++ b/vendor/github.com/blevesearch/bleve/index/index.go @@ -18,11 +18,23 @@ import ( "bytes" "encoding/json" "fmt" + "reflect" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermFieldDoc int +var reflectStaticSizeTermFieldVector int + +func init() { + var tfd TermFieldDoc + reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size()) + var tfv TermFieldVector + reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size()) +} + var ErrorUnknownStorageType = fmt.Errorf("unknown storage type") type Index interface { @@ -68,6 +80,8 @@ type IndexReader interface { Document(id string) (*document.Document, error) DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor 
DocumentFieldTermVisitor) error + DocValueReader(fields []string) (DocValueReader, error) + Fields() ([]string, error) GetInternal(key []byte) ([]byte, error) @@ -84,6 +98,29 @@ type IndexReader interface { Close() error } +// The Regexp interface defines the subset of the regexp.Regexp API +// methods that are used by bleve indexes, allowing callers to pass in +// alternate implementations. +type Regexp interface { + FindStringIndex(s string) (loc []int) + + LiteralPrefix() (prefix string, complete bool) + + String() string +} + +type IndexReaderRegexp interface { + FieldDictRegexp(field string, regex string) (FieldDict, error) +} + +type IndexReaderFuzzy interface { + FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error) +} + +type IndexReaderOnly interface { + FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) +} + // FieldTerms contains the terms used by a document, keyed by field type FieldTerms map[string][]string @@ -115,6 +152,11 @@ type TermFieldVector struct { End uint64 } +func (tfv *TermFieldVector) Size() int { + return reflectStaticSizeTermFieldVector + size.SizeOfPtr + + len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64 +} + // IndexInternalID is an opaque document identifier interal to the index impl type IndexInternalID []byte @@ -134,14 +176,27 @@ type TermFieldDoc struct { Vectors []*TermFieldVector } +func (tfd *TermFieldDoc) Size() int { + sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr + + len(tfd.Term) + len(tfd.ID) + + for _, entry := range tfd.Vectors { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + // Reset allows an already allocated TermFieldDoc to be reused func (tfd *TermFieldDoc) Reset() *TermFieldDoc { // remember the []byte used for the ID id := tfd.ID + vectors := tfd.Vectors // idiom to copy over from empty TermFieldDoc (0 allocations) *tfd = TermFieldDoc{} // reuse the []byte already allocated (and reset len to 0) tfd.ID 
= id[:0] + tfd.Vectors = vectors[:0] return tfd } @@ -161,6 +216,8 @@ type TermFieldReader interface { // Count returns the number of documents contains the term in this field. Count() uint64 Close() error + + Size() int } type DictEntry struct { @@ -185,12 +242,18 @@ type DocIDReader interface { // will start there instead. If ID is greater than or equal to the end of // the range, Next() call will return io.EOF. Advance(ID IndexInternalID) (IndexInternalID, error) + + Size() int + Close() error } +type BatchCallback func(error) + type Batch struct { - IndexOps map[string]*document.Document - InternalOps map[string][]byte + IndexOps map[string]*document.Document + InternalOps map[string][]byte + persistedCallback BatchCallback } func NewBatch() *Batch { @@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) { b.InternalOps[string(key)] = nil } +func (b *Batch) SetPersistedCallback(f BatchCallback) { + b.persistedCallback = f +} + +func (b *Batch) PersistedCallback() BatchCallback { + return b.persistedCallback +} + func (b *Batch) String() string { rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps)) for k, v := range b.IndexOps { @@ -238,4 +309,53 @@ func (b *Batch) String() string { func (b *Batch) Reset() { b.IndexOps = make(map[string]*document.Document) b.InternalOps = make(map[string][]byte) + b.persistedCallback = nil +} + +func (b *Batch) Merge(o *Batch) { + for k, v := range o.IndexOps { + b.IndexOps[k] = v + } + for k, v := range o.InternalOps { + b.InternalOps[k] = v + } +} + +func (b *Batch) TotalDocSize() int { + var s int + for k, v := range b.IndexOps { + if v != nil { + s += v.Size() + size.SizeOfString + } + s += len(k) + } + return s +} + +// Optimizable represents an optional interface that implementable by +// optimizable resources (e.g., TermFieldReaders, Searchers). 
These +// optimizable resources are provided the same OptimizableContext +// instance, so that they can coordinate via dynamic interface +// casting. +type Optimizable interface { + Optimize(kind string, octx OptimizableContext) (OptimizableContext, error) +} + +// Represents a result of optimization -- see the Finish() method. +type Optimized interface{} + +type OptimizableContext interface { + // Once all the optimzable resources have been provided the same + // OptimizableContext instance, the optimization preparations are + // finished or completed via the Finish() method. + // + // Depending on the optimization being performed, the Finish() + // method might return a non-nil Optimized instance. For example, + // the Optimized instance might represent an optimized + // TermFieldReader instance. + Finish() (Optimized, error) +} + +type DocValueReader interface { + VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go b/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go index 1a7d656ca..2d04bd38e 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go @@ -19,7 +19,9 @@ import ( "sync/atomic" "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/index/scorch/segment/zap" ) type segmentIntroduction struct { @@ -29,8 +31,14 @@ type segmentIntroduction struct { ids []string internal map[string][]byte - applied chan error - persisted chan error + applied chan error + persisted chan error + persistedCallback index.BatchCallback +} + +type persistIntroduction struct { + persisted map[uint64]segment.Segment + applied notificationChan } type epochWatcher struct { @@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() { var epochWatchers []*epochWatcher OUTER: for { + 
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1) + select { case <-s.closeCh: break OUTER @@ -64,6 +74,9 @@ OUTER: continue OUTER } + case persist := <-s.persists: + s.introducePersist(persist) + case revertTo := <-s.revertToSnapshots: err := s.revertToSnapshot(revertTo) if err != nil { @@ -92,32 +105,38 @@ OUTER: } func (s *Scorch) introduceSegment(next *segmentIntroduction) error { - // acquire lock - s.rootLock.Lock() + atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1) + + s.rootLock.RLock() + root := s.root + root.AddRef() + s.rootLock.RUnlock() + + defer func() { _ = root.DecRef() }() - nsegs := len(s.root.segment) + nsegs := len(root.segment) // prepare new index snapshot newSnapshot := &IndexSnapshot{ parent: s, segment: make([]*SegmentSnapshot, 0, nsegs+1), offsets: make([]uint64, 0, nsegs+1), - internal: make(map[string][]byte, len(s.root.internal)), - epoch: s.nextSnapshotEpoch, + internal: make(map[string][]byte, len(root.internal)), refs: 1, + creator: "introduceSegment", } - s.nextSnapshotEpoch++ // iterate through current segments var running uint64 - for i := range s.root.segment { + var docsToPersistCount, memSegments, fileSegments uint64 + for i := range root.segment { // see if optimistic work included this segment - delta, ok := next.obsoletes[s.root.segment[i].id] + delta, ok := next.obsoletes[root.segment[i].id] if !ok { var err error - delta, err = s.root.segment[i].segment.DocNumbers(next.ids) + delta, err = root.segment[i].segment.DocNumbers(next.ids) if err != nil { - s.rootLock.Unlock() next.applied <- fmt.Errorf("error computing doc numbers: %v", err) close(next.applied) _ = newSnapshot.DecRef() @@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } newss := &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - cachedDocs: s.root.segment[i].cachedDocs, + id: root.segment[i].id, + segment: 
root.segment[i].segment, + cachedDocs: root.segment[i].cachedDocs, + creator: root.segment[i].creator, } // apply new obsoletions - if s.root.segment[i].deleted == nil { + if root.segment[i].deleted == nil { newss.deleted = delta } else { - newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) + newss.deleted = roaring.Or(root.segment[i].deleted, delta) + } + if newss.deleted.IsEmpty() { + newss.deleted = nil } // check for live size before copying if newss.LiveSize() > 0 { newSnapshot.segment = append(newSnapshot.segment, newss) - s.root.segment[i].segment.AddRef() + root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) - running += s.root.segment[i].Count() + running += newss.segment.Count() + } + + if isMemorySegment(root.segment[i]) { + docsToPersistCount += root.segment[i].Count() + memSegments++ + } else { + fileSegments++ } } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) + // append new segment, if any, to end of the new index snapshot if next.data != nil { newSegmentSnapshot := &SegmentSnapshot{ id: next.id, segment: next.data, // take ownership of next.data's ref-count cachedDocs: &cachedDocs{cache: nil}, + creator: "introduceSegment", } newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) newSnapshot.offsets = append(newSnapshot.offsets, running) // increment numItemsIntroduced which tracks the number of items // queued for persistence. 
- atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1) } // copy old values - for key, oldVal := range s.root.internal { + for key, oldVal := range root.internal { newSnapshot.internal[key] = oldVal } // set new values and apply deletes @@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { delete(newSnapshot.internal, key) } } + + newSnapshot.updateSize() + s.rootLock.Lock() if next.persisted != nil { s.rootPersisted = append(s.rootPersisted, next.persisted) } + if next.persistedCallback != nil { + s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback) + } // swap in new index snapshot + newSnapshot.epoch = s.nextSnapshotEpoch + s.nextSnapshotEpoch++ rootPrev := s.root s.root = newSnapshot + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) // release lock s.rootLock.Unlock() @@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { return nil } -func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { - // acquire lock +func (s *Scorch) introducePersist(persist *persistIntroduction) { + atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1) + s.rootLock.Lock() + root := s.root + root.AddRef() + nextSnapshotEpoch := s.nextSnapshotEpoch + s.nextSnapshotEpoch++ + s.rootLock.Unlock() - // prepare new index snapshot - currSize := len(s.root.segment) - newSize := currSize + 1 - len(nextMerge.old) + defer func() { _ = root.DecRef() }() + + newIndexSnapshot := &IndexSnapshot{ + parent: s, + epoch: nextSnapshotEpoch, + segment: make([]*SegmentSnapshot, len(root.segment)), + offsets: make([]uint64, len(root.offsets)), + internal: make(map[string][]byte, len(root.internal)), + refs: 1, + creator: "introducePersist", + } + + var docsToPersistCount, memSegments, 
fileSegments uint64 + for i, segmentSnapshot := range root.segment { + // see if this segment has been replaced + if replacement, ok := persist.persisted[segmentSnapshot.id]; ok { + newSegmentSnapshot := &SegmentSnapshot{ + id: segmentSnapshot.id, + segment: replacement, + deleted: segmentSnapshot.deleted, + cachedDocs: segmentSnapshot.cachedDocs, + creator: "introducePersist", + } + newIndexSnapshot.segment[i] = newSegmentSnapshot + delete(persist.persisted, segmentSnapshot.id) + + // update items persisted incase of a new segment snapshot + atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotPersistedSegments, 1) + fileSegments++ + } else { + newIndexSnapshot.segment[i] = root.segment[i] + newIndexSnapshot.segment[i].segment.AddRef() + + if isMemorySegment(root.segment[i]) { + docsToPersistCount += root.segment[i].Count() + memSegments++ + } else { + fileSegments++ + } + } + newIndexSnapshot.offsets[i] = root.offsets[i] + } + + for k, v := range root.internal { + newIndexSnapshot.internal[k] = v + } + + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) + newIndexSnapshot.updateSize() + s.rootLock.Lock() + rootPrev := s.root + s.root = newIndexSnapshot + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) + s.rootLock.Unlock() - // empty segments deletion - if nextMerge.new == nil { - newSize-- + if rootPrev != nil { + _ = rootPrev.DecRef() } + close(persist.applied) +} + +func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { + atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1) + + s.rootLock.RLock() + root := s.root + root.AddRef() + s.rootLock.RUnlock() + + defer func() { _ = root.DecRef() }() + newSnapshot := &IndexSnapshot{ parent: s, - segment: make([]*SegmentSnapshot, 0, newSize), - offsets: 
make([]uint64, 0, newSize), - internal: s.root.internal, - epoch: s.nextSnapshotEpoch, + internal: root.internal, refs: 1, + creator: "introduceMerge", } - s.nextSnapshotEpoch++ // iterate through current segments newSegmentDeleted := roaring.NewBitmap() - var running uint64 - for i := range s.root.segment { - segmentID := s.root.segment[i].id + var running, docsToPersistCount, memSegments, fileSegments uint64 + for i := range root.segment { + segmentID := root.segment[i].id if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { // this segment is going away, see if anything else was deleted since we started the merge - if segSnapAtMerge != nil && s.root.segment[i].deleted != nil { + if segSnapAtMerge != nil && root.segment[i].deleted != nil { // assume all these deletes are new - deletedSince := s.root.segment[i].deleted + deletedSince := root.segment[i].deleted // if we already knew about some of them, remove if segSnapAtMerge.deleted != nil { - deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) + deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted) } deletedSinceItr := deletedSince.Iterator() for deletedSinceItr.HasNext() { @@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // segments left behind in old map after processing // the root segments would be the obsolete segment set delete(nextMerge.old, segmentID) - - } else if s.root.segment[i].LiveSize() > 0 { + } else if root.segment[i].LiveSize() > 0 { // this segment is staying newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - deleted: s.root.segment[i].deleted, - cachedDocs: s.root.segment[i].cachedDocs, + id: root.segment[i].id, + segment: root.segment[i].segment, + deleted: root.segment[i].deleted, + cachedDocs: root.segment[i].cachedDocs, + creator: root.segment[i].creator, }) - s.root.segment[i].segment.AddRef() + 
root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) - running += s.root.segment[i].Count() + running += root.segment[i].segment.Count() + + if isMemorySegment(root.segment[i]) { + docsToPersistCount += root.segment[i].Count() + memSegments++ + } else { + fileSegments++ + } } } @@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } + // In case where all the docs in the newly merged segment getting // deleted by the time we reach here, can skip the introduction. if nextMerge.new != nil && @@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { segment: nextMerge.new, // take ownership for nextMerge.new's ref-count deleted: newSegmentDeleted, cachedDocs: &cachedDocs{cache: nil}, + creator: "introduceMerge", }) newSnapshot.offsets = append(newSnapshot.offsets, running) + atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) + + switch nextMerge.new.(type) { + case *zap.SegmentBase: + docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() + memSegments++ + case *zap.Segment: + fileSegments++ + } } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) + newSnapshot.AddRef() // 1 ref for the nextMerge.notify response - // swap in new segment + newSnapshot.updateSize() + + s.rootLock.Lock() + // swap in new index snapshot + newSnapshot.epoch = s.nextSnapshotEpoch + s.nextSnapshotEpoch++ rootPrev := s.root s.root = newSnapshot + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) // release lock s.rootLock.Unlock() @@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { + atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) + if revertTo.snapshot 
== nil { err := fmt.Errorf("Cannot revert to a nil snapshot") revertTo.applied <- err @@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { internal: revertTo.snapshot.internal, epoch: s.nextSnapshotEpoch, refs: 1, + creator: "revertToSnapshot", } s.nextSnapshotEpoch++ + var docsToPersistCount, memSegments, fileSegments uint64 // iterate through segments for i, segmentSnapshot := range revertTo.snapshot.segment { newSnapshot.segment[i] = &SegmentSnapshot{ @@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { segment: segmentSnapshot.segment, deleted: segmentSnapshot.deleted, cachedDocs: segmentSnapshot.cachedDocs, + creator: segmentSnapshot.creator, } newSnapshot.segment[i].segment.AddRef() // remove segment from ineligibleForRemoval map filename := zapFileName(segmentSnapshot.id) delete(s.ineligibleForRemoval, filename) + + if isMemorySegment(segmentSnapshot) { + docsToPersistCount += segmentSnapshot.Count() + memSegments++ + } else { + fileSegments++ + } } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) + if revertTo.persisted != nil { s.rootPersisted = append(s.rootPersisted, revertTo.persisted) } + newSnapshot.updateSize() + // swap in new snapshot rootPrev := s.root s.root = newSnapshot + + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) // release lock s.rootLock.Unlock() @@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { return nil } + +func isMemorySegment(s *SegmentSnapshot) bool { + switch s.segment.(type) { + case *zap.SegmentBase: + return true + default: + return false + } +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/merge.go b/vendor/github.com/blevesearch/bleve/index/scorch/merge.go index ad756588a..bcbf5b710 100644 --- 
a/vendor/github.com/blevesearch/bleve/index/scorch/merge.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/merge.go @@ -15,9 +15,7 @@ package scorch import ( - "bytes" "encoding/json" - "fmt" "os" "sync/atomic" @@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() { OUTER: for { + atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1) + select { case <-s.closeCh: break OUTER default: // check to see if there is a new snapshot to persist - s.rootLock.RLock() + s.rootLock.Lock() ourSnapshot := s.root ourSnapshot.AddRef() - s.rootLock.RUnlock() + atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size())) + atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) + s.rootLock.Unlock() if ourSnapshot.epoch != lastEpochMergePlanned { startTime := time.Now() @@ -57,12 +59,21 @@ OUTER: // lets get started err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) if err != nil { + atomic.StoreUint64(&s.iStats.mergeEpoch, 0) + if err == segment.ErrClosed { + // index has been closed + _ = ourSnapshot.DecRef() + break OUTER + } s.fireAsyncError(fmt.Errorf("merging err: %v", err)) _ = ourSnapshot.DecRef() + atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) continue OUTER } lastEpochMergePlanned = ourSnapshot.epoch + atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch) + s.fireEvent(EventKindMergerProgress, time.Since(startTime)) } _ = ourSnapshot.DecRef() @@ -88,7 +99,10 @@ OUTER: case <-ew.notifyCh: } } + + atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1) } + s.asyncTasks.Done() } @@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, if err != nil { return &mergePlannerOptions, err } + + err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions) + if err != nil { + return nil, err + } } return &mergePlannerOptions, nil } @@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, } } + atomic.AddUint64(&s.stats.TotFileMergePlan, 1) + // give this list to the planner 
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) if err != nil { + atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) return fmt.Errorf("merge planning err: %v", err) } if resultMergePlan == nil { // nothing to do + atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) return nil } + atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) + + atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) + // process tasks in serial for now var notifications []chan *IndexSnapshot for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { + atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) continue } + atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments))) + oldMap := make(map[uint64]*SegmentSnapshot) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) + for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { if segSnapshot.LiveSize() == 0 { + atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) oldMap[segSnapshot.id] = nil } else { segmentsToMerge = append(segmentsToMerge, zapSeg) @@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, } var oldNewDocNums map[uint64][]uint64 - var segment segment.Segment + var seg segment.Segment if len(segmentsToMerge) > 0 { filename := zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename - newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + + fileMergeZapStartTime := time.Now() + + atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) + newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path, + DefaultChunkFactor, s.closeCh, s) + 
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) + + fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) + atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) + if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime { + atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) + } + if err != nil { s.unmarkIneligibleForRemoval(filename) + atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) + if err == segment.ErrClosed { + return err + } return fmt.Errorf("merging failed: %v", err) } - segment, err = zap.Open(path) + + seg, err = zap.Open(path) if err != nil { s.unmarkIneligibleForRemoval(filename) + atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return err } oldNewDocNums = make(map[uint64][]uint64) for i, segNewDocNums := range newDocNums { oldNewDocNums[task.Segments[i].Id()] = segNewDocNums } + + atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge))) } sm := &segmentMerge{ id: newSegmentID, old: oldMap, oldNewDocNums: oldNewDocNums, - new: segment, + new: seg, notify: make(chan *IndexSnapshot, 1), } notifications = append(notifications, sm.notify) @@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // give it to the introducer select { case <-s.closeCh: - _ = segment.Close() - return nil + _ = seg.Close() + return segment.ErrClosed case s.merges <- sm: + atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) } + + atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) } + for _, notification := range notifications { select { case <-s.closeCh: - return nil + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1) + return segment.ErrClosed case newSnapshot := <-notification: + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) if newSnapshot != nil { _ = newSnapshot.DecRef() } } } + return nil } @@ -219,44 +279,48 @@ type segmentMerge struct { // into the root func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, sbs 
[]*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, - chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { - var br bytes.Buffer + chunkFactor uint32) (*IndexSnapshot, uint64, error) { + atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) - cr := zap.NewCountHashWriter(&br) + memMergeZapStartTime := time.Now() - newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, - docValueOffset, dictLocs, fieldsInv, fieldsMap, err := - zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) - if err != nil { - return 0, nil, 0, err - } - - sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, - fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, - docValueOffset, dictLocs) - if err != nil { - return 0, nil, 0, err - } + atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) - filename := zapFileName(newSegmentID) path := s.path + string(os.PathSeparator) + filename - err = zap.PersistSegmentBase(sb, path) + + newDocNums, _, err := + zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s) + + atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) + + memMergeZapTime := uint64(time.Since(memMergeZapStartTime)) + atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime) + if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime { + atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime) + } + if err != nil { - return 0, nil, 0, err + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) + return nil, 0, err } - segment, err := zap.Open(path) + seg, err := zap.Open(path) if err != nil { - return 0, nil, 0, err + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) + return nil, 0, err } + // update persisted stats + atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) + atomic.AddUint64(&s.stats.TotPersistedSegments, 1) + sm := &segmentMerge{ id: newSegmentID, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), - new: segment, + new: seg, 
notify: make(chan *IndexSnapshot, 1), } @@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, select { // send to introducer case <-s.closeCh: - _ = segment.DecRef() - return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? + _ = seg.DecRef() + return nil, 0, segment.ErrClosed case s.merges <- sm: } select { // wait for introduction to complete case <-s.closeCh: - return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? + return nil, 0, segment.ErrClosed case newSnapshot := <-sm.notify: - return numDocs, newSnapshot, newSegmentID, nil + atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) + atomic.AddUint64(&s.stats.TotMemMergeDone, 1) + return newSnapshot, newSegmentID, nil } } + +func (s *Scorch) ReportBytesWritten(bytesWritten uint64) { + atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten) +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go b/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go index 62f643f43..c2a0d3c64 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go @@ -18,6 +18,7 @@ package mergeplan import ( + "errors" "fmt" "math" "sort" @@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { return o.FloorSegmentSize } -// Suggested default options. +// MaxSegmentSizeLimit represents the maximum size of a segment, +// this limit comes with hit-1 optimisation/max encoding limit uint31. +const MaxSegmentSizeLimit = 1<<31 - 1 + +// ErrMaxSegmentSizeTooLarge is returned when the size of the segment +// exceeds the MaxSegmentSizeLimit +var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit") + +// DefaultMergePlanOptions suggests the default options. 
var DefaultMergePlanOptions = MergePlanOptions{ MaxSegmentsPerTier: 10, MaxSegmentSize: 5000000, @@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { if len(roster) > 0 { rosterScore := scoreSegments(roster, o) - if len(bestRoster) <= 0 || rosterScore < bestRosterScore { + if len(bestRoster) == 0 || rosterScore < bestRosterScore { bestRoster = roster bestRosterScore = rosterScore } } } - if len(bestRoster) <= 0 { + if len(bestRoster) == 0 { return rv, nil } @@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan) return strings.Join(rv, "\n") } + +// ValidateMergePlannerOptions validates the merge planner options +func ValidateMergePlannerOptions(options *MergePlanOptions) error { + if options.MaxSegmentSize > MaxSegmentSizeLimit { + return ErrMaxSegmentSizeTooLarge + } + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go b/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go new file mode 100644 index 000000000..b33e3be3d --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go @@ -0,0 +1,420 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "fmt" + + "github.com/RoaringBitmap/roaring" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/index/scorch/segment/zap" +) + +var OptimizeConjunction = true +var OptimizeConjunctionUnadorned = true +var OptimizeDisjunctionUnadorned = true + +func (s *IndexSnapshotTermFieldReader) Optimize(kind string, + octx index.OptimizableContext) (index.OptimizableContext, error) { + if OptimizeConjunction && kind == "conjunction" { + return s.optimizeConjunction(octx) + } + + if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" { + return s.optimizeConjunctionUnadorned(octx) + } + + if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" { + return s.optimizeDisjunctionUnadorned(octx) + } + + return octx, nil +} + +var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256) + +// ---------------------------------------------------------------- + +func (s *IndexSnapshotTermFieldReader) optimizeConjunction( + octx index.OptimizableContext) (index.OptimizableContext, error) { + if octx == nil { + octx = &OptimizeTFRConjunction{snapshot: s.snapshot} + } + + o, ok := octx.(*OptimizeTFRConjunction) + if !ok { + return octx, nil + } + + if o.snapshot != s.snapshot { + return nil, fmt.Errorf("tried to optimize conjunction across different snapshots") + } + + o.tfrs = append(o.tfrs, s) + + return o, nil +} + +type OptimizeTFRConjunction struct { + snapshot *IndexSnapshot + + tfrs []*IndexSnapshotTermFieldReader +} + +func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { + if len(o.tfrs) <= 1 { + return nil, nil + } + + for i := range o.snapshot.segment { + itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator) + if !ok || itr0.ActualBM == nil { + continue + } + + itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator) + if !ok || itr1.ActualBM == nil { + continue + } + + bm := roaring.And(itr0.ActualBM, itr1.ActualBM) + + for 
_, tfr := range o.tfrs[2:] { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok || itr.ActualBM == nil { + continue + } + + bm.And(itr.ActualBM) + } + + // in this conjunction optimization, the postings iterators + // will all share the same AND'ed together actual bitmap. The + // regular conjunction searcher machinery will still be used, + // but the underlying bitmap will be smaller. + for _, tfr := range o.tfrs { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if ok && itr.ActualBM != nil { + itr.ActualBM = bm + itr.Actual = bm.Iterator() + } + } + } + + return nil, nil +} + +// ---------------------------------------------------------------- + +// An "unadorned" conjunction optimization is appropriate when +// additional or subsidiary information like freq-norm's and +// term-vectors are not required, and instead only the internal-id's +// are needed. +func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned( + octx index.OptimizableContext) (index.OptimizableContext, error) { + if octx == nil { + octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot} + } + + o, ok := octx.(*OptimizeTFRConjunctionUnadorned) + if !ok { + return nil, nil + } + + if o.snapshot != s.snapshot { + return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots") + } + + o.tfrs = append(o.tfrs, s) + + return o, nil +} + +type OptimizeTFRConjunctionUnadorned struct { + snapshot *IndexSnapshot + + tfrs []*IndexSnapshotTermFieldReader +} + +var OptimizeTFRConjunctionUnadornedTerm = []byte("") +var OptimizeTFRConjunctionUnadornedField = "*" + +// Finish of an unadorned conjunction optimization will compute a +// termFieldReader with an "actual" bitmap that represents the +// constituent bitmaps AND'ed together. This termFieldReader cannot +// provide any freq-norm or termVector associated information. 
+func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) { + if len(o.tfrs) <= 1 { + return nil, nil + } + + // We use an artificial term and field because the optimized + // termFieldReader can represent multiple terms and fields. + oTFR := &IndexSnapshotTermFieldReader{ + term: OptimizeTFRConjunctionUnadornedTerm, + field: OptimizeTFRConjunctionUnadornedField, + snapshot: o.snapshot, + iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), + segmentOffset: 0, + includeFreq: false, + includeNorm: false, + includeTermVectors: false, + } + + var actualBMs []*roaring.Bitmap // Collected from regular posting lists. + +OUTER: + for i := range o.snapshot.segment { + actualBMs = actualBMs[:0] + + var docNum1HitLast uint64 + var docNum1HitLastOk bool + + for _, tfr := range o.tfrs { + if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok { + // An empty postings iterator means the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok { + // We optimize zap postings iterators only. + return nil, nil + } + + // If the postings iterator is "1-hit" optimized, then we + // can perform several optimizations up-front here. + docNum1Hit, ok := itr.DocNum1Hit() + if ok { + if docNum1Hit == zap.DocNum1HitFinished { + // An empty docNum here means the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + if docNum1HitLastOk && docNum1HitLast != docNum1Hit { + // The docNum1Hit doesn't match the previous + // docNum1HitLast, so the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + docNum1HitLast = docNum1Hit + docNum1HitLastOk = true + + continue + } + + if itr.ActualBM == nil { + // An empty actual bitmap means the entire AND is empty. 
+ oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + // Collect the actual bitmap for more processing later. + actualBMs = append(actualBMs, itr.ActualBM) + } + + if docNum1HitLastOk { + // We reach here if all the 1-hit optimized posting + // iterators had the same 1-hit docNum, so we can check if + // our collected actual bitmaps also have that docNum. + for _, bm := range actualBMs { + if !bm.Contains(uint32(docNum1HitLast)) { + // The docNum1Hit isn't in one of our actual + // bitmaps, so the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + } + + // The actual bitmaps and docNum1Hits all contain or have + // the same 1-hit docNum, so that's our AND'ed result. + oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit( + docNum1HitLast, zap.NormBits1Hit, false, false) + if err != nil { + return nil, nil + } + + continue OUTER + } + + if len(actualBMs) == 0 { + // If we've collected no actual bitmaps at this point, + // then the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + if len(actualBMs) == 1 { + // If we've only 1 actual bitmap, then that's our result. + oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( + actualBMs[0], false, false) + if err != nil { + return nil, nil + } + + continue OUTER + } + + // Else, AND together our collected bitmaps as our result. + bm := roaring.And(actualBMs[0], actualBMs[1]) + + for _, actualBM := range actualBMs[2:] { + bm.And(actualBM) + } + + oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( + bm, false, false) + if err != nil { + return nil, nil + } + } + + return oTFR, nil +} + +// ---------------------------------------------------------------- + +// An "unadorned" disjunction optimization is appropriate when +// additional or subsidiary information like freq-norm's and +// term-vectors are not required, and instead only the internal-id's +// are needed. 
+func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( + octx index.OptimizableContext) (index.OptimizableContext, error) { + if octx == nil { + octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot} + } + + o, ok := octx.(*OptimizeTFRDisjunctionUnadorned) + if !ok { + return nil, nil + } + + if o.snapshot != s.snapshot { + return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots") + } + + o.tfrs = append(o.tfrs, s) + + return o, nil +} + +type OptimizeTFRDisjunctionUnadorned struct { + snapshot *IndexSnapshot + + tfrs []*IndexSnapshotTermFieldReader +} + +var OptimizeTFRDisjunctionUnadornedTerm = []byte("") +var OptimizeTFRDisjunctionUnadornedField = "*" + +// Finish of an unadorned disjunction optimization will compute a +// termFieldReader with an "actual" bitmap that represents the +// constituent bitmaps OR'ed together. This termFieldReader cannot +// provide any freq-norm or termVector associated information. +func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) { + if len(o.tfrs) <= 1 { + return nil, nil + } + + for i := range o.snapshot.segment { + var cMax uint64 + + for _, tfr := range o.tfrs { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok { + return nil, nil + } + + if itr.ActualBM != nil { + c := itr.ActualBM.GetCardinality() + if cMax < c { + cMax = c + } + } + } + + // Heuristic to skip the optimization if all the constituent + // bitmaps are too small, where the processing & resource + // overhead to create the OR'ed bitmap outweighs the benefit. + if cMax < OptimizeDisjunctionUnadornedMinChildCardinality { + return nil, nil + } + } + + // We use an artificial term and field because the optimized + // termFieldReader can represent multiple terms and fields. 
+ oTFR := &IndexSnapshotTermFieldReader{ + term: OptimizeTFRDisjunctionUnadornedTerm, + field: OptimizeTFRDisjunctionUnadornedField, + snapshot: o.snapshot, + iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), + segmentOffset: 0, + includeFreq: false, + includeNorm: false, + includeTermVectors: false, + } + + var docNums []uint32 // Collected docNum's from 1-hit posting lists. + var actualBMs []*roaring.Bitmap // Collected from regular posting lists. + + for i := range o.snapshot.segment { + docNums = docNums[:0] + actualBMs = actualBMs[:0] + + for _, tfr := range o.tfrs { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok { + return nil, nil + } + + docNum, ok := itr.DocNum1Hit() + if ok { + docNums = append(docNums, uint32(docNum)) + continue + } + + if itr.ActualBM != nil { + actualBMs = append(actualBMs, itr.ActualBM) + } + } + + var bm *roaring.Bitmap + if len(actualBMs) > 2 { + bm = roaring.HeapOr(actualBMs...) + } else if len(actualBMs) == 2 { + bm = roaring.Or(actualBMs[0], actualBMs[1]) + } else if len(actualBMs) == 1 { + bm = actualBMs[0].Clone() + } + + if bm == nil { + bm = roaring.New() + } + + bm.AddMany(docNums) + + oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false) + if err != nil { + return nil, nil + } + } + + return oTFR, nil +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/persister.go b/vendor/github.com/blevesearch/bleve/index/scorch/persister.go index c21bb1439..349ccdc0e 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/persister.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/persister.go @@ -16,9 +16,12 @@ package scorch import ( "bytes" + "encoding/binary" + "encoding/json" "fmt" "io/ioutil" "log" + "math" "os" "path/filepath" "strconv" @@ -27,23 +30,57 @@ import ( "time" "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" 
"github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) var DefaultChunkFactor uint32 = 1024 -// Arbitrary number, need to make it configurable. -// Lower values like 10/making persister really slow -// doesn't work well as it is creating more files to -// persist for in next persist iteration and spikes the # FDs. -// Ideal value should let persister also proceed at -// an optimum pace so that the merger can skip -// many intermediate snapshots. -// This needs to be based on empirical data. -// TODO - may need to revisit this approach/value. -var epochDistance = uint64(5) +// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct +// persistence of segments with the default safe batch option. +// If the default safe batch option results in high number of +// files on disk, then users may initialise this configuration parameter +// with higher values so that the persister will nap a bit within it's +// work loop to favour better in-memory merging of segments to result +// in fewer segment files on disk. But that may come with an indexing +// performance overhead. +// Unsafe batch users are advised to override this to higher value +// for better performance especially with high data density. +var DefaultPersisterNapTimeMSec int = 0 // ms + +// DefaultPersisterNapUnderNumFiles helps in controlling the pace of +// persister. At times of a slow merger progress with heavy file merging +// operations, its better to pace down the persister for letting the merger +// to catch up within a range defined by this parameter. +// Fewer files on disk (as per the merge plan) would result in keeping the +// file handle usage under limit, faster disk merger and a healthier index. +// Its been observed that such a loosely sync'ed introducer-persister-merger +// trio results in better overall performance. 
+var DefaultPersisterNapUnderNumFiles int = 1000 + +var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64 + +type persisterOptions struct { + // PersisterNapTimeMSec controls the wait/delay injected into + // persistence workloop to improve the chances for + // a healthier and heavier in-memory merging + PersisterNapTimeMSec int + + // PersisterNapTimeMSec > 0, and the number of files is less than + // PersisterNapUnderNumFiles, then the persister will sleep + // PersisterNapTimeMSec amount of time to improve the chances for + // a healthier and heavier in-memory merging + PersisterNapUnderNumFiles int + + // MemoryPressurePauseThreshold let persister to have a better leeway + // for prudently performing the memory merge of segments on a memory + // pressure situation. Here the config value is an upper threshold + // for the number of paused application threads. The default value would + // be a very high number to always favour the merging of memory segments. + MemoryPressurePauseThreshold uint64 +} type notificationChan chan struct{} @@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() { var persistWatchers []*epochWatcher var lastPersistedEpoch, lastMergedEpoch uint64 var ew *epochWatcher + po, err := s.parsePersisterOptions() + if err != nil { + s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) + s.asyncTasks.Done() + return + } + OUTER: for { + atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) + select { case <-s.closeCh: break OUTER @@ -65,11 +111,13 @@ OUTER: if ew != nil && ew.epoch > lastMergedEpoch { lastMergedEpoch = ew.epoch } - persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, - &lastMergedEpoch, persistWatchers) + + lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, + lastMergedEpoch, persistWatchers, po) var ourSnapshot *IndexSnapshot var ourPersisted []chan error + var ourPersistedCallbacks []index.BatchCallback // check to see if there is a new snapshot to 
persist s.rootLock.Lock() @@ -78,13 +126,17 @@ OUTER: ourSnapshot.AddRef() ourPersisted = s.rootPersisted s.rootPersisted = nil + ourPersistedCallbacks = s.persistedCallbacks + s.persistedCallbacks = nil + atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size())) + atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch) } s.rootLock.Unlock() if ourSnapshot != nil { startTime := time.Now() - err := s.persistSnapshot(ourSnapshot) + err := s.persistSnapshot(ourSnapshot, po) for _, ch := range ourPersisted { if err != nil { ch <- err @@ -92,10 +144,22 @@ OUTER: close(ch) } if err != nil { + atomic.StoreUint64(&s.iStats.persistEpoch, 0) + if err == segment.ErrClosed { + // index has been closed + _ = ourSnapshot.DecRef() + break OUTER + } s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) _ = ourSnapshot.DecRef() + atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) continue OUTER } + for i := range ourPersistedCallbacks { + ourPersistedCallbacks[i](err) + } + + atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch) lastPersistedEpoch = ourSnapshot.epoch for _, ew := range persistWatchers { @@ -115,6 +179,8 @@ OUTER: s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) if changed { + s.removeOldData() + atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1) continue OUTER } } @@ -133,17 +199,21 @@ OUTER: s.removeOldData() // might as well cleanup while waiting + atomic.AddUint64(&s.stats.TotPersistLoopWait, 1) + select { case <-s.closeCh: break OUTER case <-w.notifyCh: // woken up, next loop should pick up work - continue OUTER + atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1) case ew = <-s.persisterNotifier: // if the watchers are already caught up then let them wait, // else let them continue to do the catch up persistWatchers = append(persistWatchers, ew) } + + atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1) } } @@ -160,38 +230,95 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, return 
watchersNext } -func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, - persistWatchers []*epochWatcher) []*epochWatcher { +func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, + persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { // first, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + // check the merger lag by counting the segment files on disk, + // On finding fewer files on disk, persister takes a short pause + // for sufficient in-memory segments to pile up for the next + // memory merge cum persist loop. + // On finding too many files on disk, persister pause until the merger + // catches up to reduce the segment file count under the threshold. + // But if there is memory pressure, then skip this sleep maneuvers. + numFilesOnDisk, _ := s.diskFileStats() + if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && + po.PersisterNapTimeMSec > 0 && s.paused() == 0 { + select { + case <-s.closeCh: + case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): + atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1) + + case ew := <-s.persisterNotifier: + // unblock the merger in meantime + persistWatchers = append(persistWatchers, ew) + lastMergedEpoch = ew.epoch + persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1) + } + return lastMergedEpoch, persistWatchers + } + OUTER: - // check for slow merger and await until the merger catch up - for lastPersistedEpoch > *lastMergedEpoch+epochDistance { + for po.PersisterNapUnderNumFiles > 0 && + numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) && + lastMergedEpoch < lastPersistedEpoch { + atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) select { case <-s.closeCh: break OUTER case ew := <-s.persisterNotifier: 
persistWatchers = append(persistWatchers, ew) - *lastMergedEpoch = ew.epoch + lastMergedEpoch = ew.epoch } + atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1) + // let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + + numFilesOnDisk, _ = s.diskFileStats() } - return persistWatchers + return lastMergedEpoch, persistWatchers } -func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { - persisted, err := s.persistSnapshotMaybeMerge(snapshot) - if err != nil { - return err +func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { + po := persisterOptions{ + PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, + PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, + MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold, } - if persisted { - return nil + if v, ok := s.config["scorchPersisterOptions"]; ok { + b, err := json.Marshal(v) + if err != nil { + return &po, err + } + + err = json.Unmarshal(b, &po) + if err != nil { + return &po, err + } + } + return &po, nil +} + +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, + po *persisterOptions) error { + // Perform in-memory segment merging only when the memory pressure is + // below the configured threshold, else the persister performs the + // direct persistence of segments. 
+ if s.paused() < po.MemoryPressurePauseThreshold { + persisted, err := s.persistSnapshotMaybeMerge(snapshot) + if err != nil { + return err + } + if persisted { + return nil + } } return s.persistSnapshotDirect(snapshot) @@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( return false, nil } - _, newSnapshot, newSegmentID, err := s.mergeSegmentBases( + newSnapshot, newSegmentID, err := s.mergeSegmentBases( snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) if err != nil { return false, err @@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)), internal: snapshot.internal, epoch: snapshot.epoch, + creator: "persistSnapshotMaybeMerge", } // copy to the equiv the segments that weren't replaced @@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { return err } + // persist meta values + metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey) + if err != nil { + return err + } + err = metaBucket.Put([]byte("type"), []byte(zap.Type)) + if err != nil { + return err + } + buf := make([]byte, binary.MaxVarintLen32) + binary.BigEndian.PutUint32(buf, zap.Version) + err = metaBucket.Put([]byte("version"), buf) + if err != nil { + return err + } + // persist internal values internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) if err != nil { @@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { } } - s.rootLock.Lock() - newIndexSnapshot := &IndexSnapshot{ - parent: s, - epoch: s.nextSnapshotEpoch, - segment: make([]*SegmentSnapshot, len(s.root.segment)), - offsets: make([]uint64, len(s.root.offsets)), - internal: make(map[string][]byte, len(s.root.internal)), - refs: 1, - } - s.nextSnapshotEpoch++ - for i, segmentSnapshot := range s.root.segment { - // see if this segment has been replaced - if 
replacement, ok := newSegments[segmentSnapshot.id]; ok { - newSegmentSnapshot := &SegmentSnapshot{ - id: segmentSnapshot.id, - segment: replacement, - deleted: segmentSnapshot.deleted, - cachedDocs: segmentSnapshot.cachedDocs, - } - newIndexSnapshot.segment[i] = newSegmentSnapshot - delete(newSegments, segmentSnapshot.id) - // update items persisted incase of a new segment snapshot - atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) - } else { - newIndexSnapshot.segment[i] = s.root.segment[i] - newIndexSnapshot.segment[i].segment.AddRef() - } - newIndexSnapshot.offsets[i] = s.root.offsets[i] + persist := &persistIntroduction{ + persisted: newSegments, + applied: make(notificationChan), } - for k, v := range s.root.internal { - newIndexSnapshot.internal[k] = v + + select { + case <-s.closeCh: + return segment.ErrClosed + case s.persists <- persist: } - rootPrev := s.root - s.root = newIndexSnapshot - s.rootLock.Unlock() - if rootPrev != nil { - _ = rootPrev.DecRef() + select { + case <-s.closeCh: + return segment.ErrClosed + case <-persist.applied: } } @@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'} var boltPathKey = []byte{'p'} var boltDeletedKey = []byte{'d'} var boltInternalKey = []byte{'i'} +var boltMetaDataKey = []byte{'m'} func (s *Scorch) loadFromBolt() error { return s.rootBolt.View(func(tx *bolt.Tx) error { @@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error { continue } if foundRoot { - s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) + s.AddEligibleForRemoval(snapshotEpoch) continue } snapshot := snapshots.Bucket(k) if snapshot == nil { log.Printf("snapshot key, but bucket missing %x, continuing", k) - s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) + s.AddEligibleForRemoval(snapshotEpoch) continue } indexSnapshot, err := s.loadSnapshot(snapshot) if err != nil { log.Printf("unable to load snapshot, %v, continuing", err) - s.eligibleForRemoval = append(s.eligibleForRemoval, 
snapshotEpoch) + s.AddEligibleForRemoval(snapshotEpoch) continue } indexSnapshot.epoch = snapshotEpoch @@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error { return err } s.nextSegmentID++ - s.nextSnapshotEpoch = snapshotEpoch + 1 s.rootLock.Lock() - if s.root != nil { - _ = s.root.DecRef() - } + s.nextSnapshotEpoch = snapshotEpoch + 1 + rootPrev := s.root s.root = indexSnapshot s.rootLock.Unlock() + + if rootPrev != nil { + _ = rootPrev.DecRef() + } + foundRoot = true } return nil @@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { snapshotKey := segment.EncodeUvarintAscending(nil, epoch) snapshot := snapshots.Bucket(snapshotKey) if snapshot == nil { - return nil + return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch) } rv, err = s.loadSnapshot(snapshot) return err @@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { } func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { - rv := &IndexSnapshot{ parent: s, internal: make(map[string][]byte), refs: 1, + creator: "loadSnapshot", } + var running uint64 c := snapshot.Cursor() for k, _ := c.First(); k != nil; k, _ = c.Next() { @@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { _ = rv.DecRef() return nil, err } - } else { + } else if k[0] != boltMetaDataKey[0] { segmentBucket := snapshot.Bucket(k) if segmentBucket == nil { _ = rv.DecRef() @@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { running += segmentSnapshot.segment.Count() } } + return rv, nil } @@ -604,7 +731,9 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro _ = segment.Close() return nil, fmt.Errorf("error reading deleted bytes: %v", err) } - rv.deleted = deletedBitmap + if !deletedBitmap.IsEmpty() { + rv.deleted = deletedBitmap + } } return rv, nil @@ -643,14 +772,14 @@ func (s *Scorch) 
removeOldBoltSnapshots() (numRemoved int, err error) { return 0, err } - if len(persistedEpochs) <= NumSnapshotsToKeep { + if len(persistedEpochs) <= s.numSnapshotsToKeep { // we need to keep everything return 0, nil } // make a map of epochs to protect from deletion - protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep) - for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] { + protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep) + for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] { protectedEpochs[epoch] = struct{}{} } @@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { s.eligibleForRemoval = newEligible s.rootLock.Unlock() - if len(epochsToRemove) <= 0 { + if len(epochsToRemove) == 0 { return 0, nil } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/reader.go b/vendor/github.com/blevesearch/bleve/index/scorch/reader.go deleted file mode 100644 index 365ecb670..000000000 --- a/vendor/github.com/blevesearch/bleve/index/scorch/reader.go +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package scorch - -import ( - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -type Reader struct { - root *IndexSnapshot // Owns 1 ref-count on the index snapshot. 
-} - -func (r *Reader) TermFieldReader(term []byte, field string, includeFreq, - includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { - return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors) -} - -// DocIDReader returns an iterator over all doc ids -// The caller must close returned instance to release associated resources. -func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) { - return r.root.DocIDReaderAll() -} - -func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { - return r.root.DocIDReaderOnly(ids) -} - -func (r *Reader) FieldDict(field string) (index.FieldDict, error) { - return r.root.FieldDict(field) -} - -// FieldDictRange is currently defined to include the start and end terms -func (r *Reader) FieldDictRange(field string, startTerm []byte, - endTerm []byte) (index.FieldDict, error) { - return r.root.FieldDictRange(field, startTerm, endTerm) -} - -func (r *Reader) FieldDictPrefix(field string, - termPrefix []byte) (index.FieldDict, error) { - return r.root.FieldDictPrefix(field, termPrefix) -} - -func (r *Reader) Document(id string) (*document.Document, error) { - return r.root.Document(id) -} -func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, - visitor index.DocumentFieldTermVisitor) error { - return r.root.DocumentVisitFieldTerms(id, fields, visitor) -} - -func (r *Reader) Fields() ([]string, error) { - return r.root.Fields() -} - -func (r *Reader) GetInternal(key []byte) ([]byte, error) { - return r.root.GetInternal(key) -} - -func (r *Reader) DocCount() (uint64, error) { - return r.root.DocCount() -} - -func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) { - return r.root.ExternalID(id) -} - -func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { - return r.root.InternalID(id) -} - -func (r *Reader) DumpAll() chan interface{} { - rv := make(chan interface{}) - go func() { - close(rv) - }() - return 
rv -} - -func (r *Reader) DumpDoc(id string) chan interface{} { - rv := make(chan interface{}) - go func() { - close(rv) - }() - return rv -} - -func (r *Reader) DumpFields() chan interface{} { - rv := make(chan interface{}) - go func() { - close(rv) - }() - return rv -} - -func (r *Reader) Close() error { - return r.root.DecRef() -} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go b/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go index f539313d1..3f3d8bffc 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go @@ -17,6 +17,7 @@ package scorch import ( "encoding/json" "fmt" + "io/ioutil" "os" "sync" "sync/atomic" @@ -27,23 +28,24 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/mem" "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) const Name = "scorch" -const Version uint8 = 1 +const Version uint8 = 2 + +var ErrClosed = fmt.Errorf("scorch closed") type Scorch struct { readOnly bool version uint8 config map[string]interface{} analysisQueue *index.AnalysisQueue - stats *Stats + stats Stats nextSegmentID uint64 path string @@ -52,12 +54,15 @@ type Scorch struct { rootLock sync.RWMutex root *IndexSnapshot // holds 1 ref-count on the root rootPersisted []chan error // closed when root is persisted + persistedCallbacks []index.BatchCallback nextSnapshotEpoch uint64 eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. 
+ numSnapshotsToKeep int closeCh chan struct{} introductions chan *segmentIntroduction + persists chan *persistIntroduction merges chan *segmentMerge introducerNotifier chan *epochWatcher revertToSnapshots chan *snapshotReversion @@ -67,6 +72,23 @@ type Scorch struct { onEvent func(event Event) onAsyncError func(err error) + + iStats internalStats + + pauseLock sync.RWMutex + + pauseCount uint64 +} + +type internalStats struct { + persistEpoch uint64 + persistSnapshotSize uint64 + mergeEpoch uint64 + mergeSnapshotSize uint64 + newSegBufBytesAdded uint64 + newSegBufBytesRemoved uint64 + analysisBytesAdded uint64 + analysisBytesRemoved uint64 } func NewScorch(storeName string, @@ -80,8 +102,7 @@ func NewScorch(storeName string, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, } - rv.stats = &Stats{i: rv} - rv.root = &IndexSnapshot{parent: rv, refs: 1} + rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"} ro, ok := config["read_only"].(bool) if ok { rv.readOnly = ro @@ -101,9 +122,30 @@ func NewScorch(storeName string, return rv, nil } +func (s *Scorch) paused() uint64 { + s.pauseLock.Lock() + pc := s.pauseCount + s.pauseLock.Unlock() + return pc +} + +func (s *Scorch) incrPause() { + s.pauseLock.Lock() + s.pauseCount++ + s.pauseLock.Unlock() +} + +func (s *Scorch) decrPause() { + s.pauseLock.Lock() + s.pauseCount-- + s.pauseLock.Unlock() +} + func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { if s.onEvent != nil { + s.incrPause() s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) + s.decrPause() } } @@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) { if s.onAsyncError != nil { s.onAsyncError(err) } + atomic.AddUint64(&s.stats.TotOnErrors, 1) } func (s *Scorch) Open() error { @@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error { } } + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment))) + s.introductions = make(chan *segmentIntroduction) + s.persists = make(chan 
*persistIntroduction) s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan *epochWatcher, 1) s.revertToSnapshots = make(chan *snapshotReversion) @@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error { } } + s.numSnapshotsToKeep = NumSnapshotsToKeep + if v, ok := s.config["numSnapshotsToKeep"]; ok { + var t int + if t, err = parseToInteger(v); err != nil { + return fmt.Errorf("numSnapshotsToKeep parse err: %v", err) + } + if t > 0 { + s.numSnapshotsToKeep = t + } + } + return nil } @@ -255,65 +312,83 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { // FIXME could sort ids list concurrent with analysis? - go func() { - for _, doc := range batch.IndexOps { - if doc != nil { - aw := index.NewAnalysisWork(s, doc, resultChan) - // put the work on the queue - s.analysisQueue.Queue(aw) + if len(batch.IndexOps) > 0 { + go func() { + for _, doc := range batch.IndexOps { + if doc != nil { + aw := index.NewAnalysisWork(s, doc, resultChan) + // put the work on the queue + s.analysisQueue.Queue(aw) + } } - } - }() + }() + } // wait for analysis result analysisResults := make([]*index.AnalysisResult, int(numUpdates)) var itemsDeQueued uint64 + var totalAnalysisSize int for itemsDeQueued < numUpdates { result := <-resultChan + resultSize := result.Size() + atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize)) + totalAnalysisSize += resultSize analysisResults[itemsDeQueued] = result itemsDeQueued++ } close(resultChan) + defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize)) + + atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start))) - atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start))) + indexStart := time.Now() // notify handlers that we're about to introduce a segment s.fireEvent(EventKindBatchIntroductionStart, 0) var newSegment segment.Segment + var bufBytes uint64 if len(analysisResults) > 0 { - newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), 
DefaultChunkFactor) + newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) if err != nil { return err } + atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes) + } else { + atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } - err = s.prepareSegment(newSegment, ids, batch.InternalOps) + err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback()) if err != nil { if newSegment != nil { _ = newSegment.Close() } - atomic.AddUint64(&s.stats.errors, 1) + atomic.AddUint64(&s.stats.TotOnErrors, 1) } else { - atomic.AddUint64(&s.stats.updates, numUpdates) - atomic.AddUint64(&s.stats.deletes, numDeletes) - atomic.AddUint64(&s.stats.batches, 1) - atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes) + atomic.AddUint64(&s.stats.TotUpdates, numUpdates) + atomic.AddUint64(&s.stats.TotDeletes, numDeletes) + atomic.AddUint64(&s.stats.TotBatches, 1) + atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes) } + + atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes) + atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart))) + return err } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, - internalOps map[string][]byte) error { + internalOps map[string][]byte, persistedCallback index.BatchCallback) error { // new introduction introduction := &segmentIntroduction{ - id: atomic.AddUint64(&s.nextSegmentID, 1), - data: newSegment, - ids: ids, - obsoletes: make(map[uint64]*roaring.Bitmap), - internal: internalOps, - applied: make(chan error), + id: atomic.AddUint64(&s.nextSegmentID, 1), + data: newSegment, + ids: ids, + obsoletes: make(map[uint64]*roaring.Bitmap), + internal: internalOps, + applied: make(chan error), + persistedCallback: persistedCallback, } if !s.unsafeBatch { @@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, root.AddRef() s.rootLock.RUnlock() + defer func() { _ = root.DecRef() }() 
+ for _, seg := range root.segment { delta, err := seg.segment.DocNumbers(ids) if err != nil { @@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, introduction.obsoletes[seg.id] = delta } - _ = root.DecRef() + introStartTime := time.Now() s.introductions <- introduction @@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, err = <-introduction.persisted } + introTime := uint64(time.Since(introStartTime)) + atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime) + if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime { + atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime) + } + return err } @@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error { // Reader returns a low-level accessor on the index data. Close it to // release associated resources. func (s *Scorch) Reader() (index.IndexReader, error) { + return s.currentSnapshot(), nil +} + +func (s *Scorch) currentSnapshot() *IndexSnapshot { s.rootLock.RLock() - rv := &Reader{root: s.root} - rv.root.AddRef() + rv := s.root + if rv != nil { + rv.AddRef() + } s.rootLock.RUnlock() - return rv, nil + return rv } func (s *Scorch) Stats() json.Marshaler { - return s.stats + return &s.stats } + +func (s *Scorch) diskFileStats() (uint64, uint64) { + var numFilesOnDisk, numBytesUsedDisk uint64 + if s.path != "" { + finfos, err := ioutil.ReadDir(s.path) + if err == nil { + for _, finfo := range finfos { + if !finfo.IsDir() { + numBytesUsedDisk += uint64(finfo.Size()) + numFilesOnDisk++ + } + } + } + } + return numFilesOnDisk, numBytesUsedDisk +} + func (s *Scorch) StatsMap() map[string]interface{} { - m, _ := s.stats.statsMap() + m := s.stats.ToMap() + + numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() + + m["CurOnDiskBytes"] = numBytesUsedDisk + m["CurOnDiskFiles"] = numFilesOnDisk + + // TODO: consider one day removing these backwards compatible + // names for apps using the old names + m["updates"] = 
m["TotUpdates"] + m["deletes"] = m["TotDeletes"] + m["batches"] = m["TotBatches"] + m["errors"] = m["TotOnErrors"] + m["analysis_time"] = m["TotAnalysisTime"] + m["index_time"] = m["TotIndexTime"] + m["term_searchers_started"] = m["TotTermSearchersStarted"] + m["term_searchers_finished"] = m["TotTermSearchersFinished"] + m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] + m["num_items_introduced"] = m["TotIntroducedItems"] + m["num_items_persisted"] = m["TotPersistedItems"] + m["num_recs_to_persist"] = m["TotItemsToPersist"] + m["num_bytes_used_disk"] = m["CurOnDiskBytes"] + m["num_files_on_disk"] = m["CurOnDiskFiles"] + m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"] + m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"] + m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"] + m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"] + m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] + return m } @@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { rv.Analyzed[i] = tokenFreqs rv.Length[i] = fieldLength - if len(d.CompositeFields) > 0 { + if len(d.CompositeFields) > 0 && field.Name() != "_id" { // see if any of the composite fields need this for _, compositeField := range d.CompositeFields { compositeField.Compose(field.Name(), fieldLength, tokenFreqs) @@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { s.rootLock.Unlock() } -func (s *Scorch) MemoryUsed() uint64 { - var memUsed uint64 - s.rootLock.RLock() - if s.root != nil { - for _, segmentSnapshot := range s.root.segment { - memUsed += 8 /* size of id -> uint64 */ + - segmentSnapshot.segment.SizeInBytes() - if segmentSnapshot.deleted != nil { - memUsed += segmentSnapshot.deleted.GetSizeInBytes() - } - memUsed += segmentSnapshot.cachedDocs.sizeInBytes() - } +func (s *Scorch) MemoryUsed() (memUsed uint64) { + indexSnapshot := s.currentSnapshot() + if indexSnapshot == 
nil { + return } - s.rootLock.RUnlock() + + defer func() { + _ = indexSnapshot.Close() + }() + + // Account for current root snapshot overhead + memUsed += uint64(indexSnapshot.Size()) + + // Account for snapshot that the persister may be working on + persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch) + persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize) + if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch { + // the snapshot that the persister is working on isn't the same as + // the current snapshot + memUsed += persistSnapshotSize + } + + // Account for snapshot that the merger may be working on + mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch) + mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize) + if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch { + // the snapshot that the merger is working on isn't the same as + // the current snapshot + memUsed += mergeSnapshotSize + } + + memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) - + atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved)) + + memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) - + atomic.LoadUint64(&s.iStats.analysisBytesRemoved)) + return memUsed } @@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) { func init() { registry.RegisterIndexType(Name, NewScorch) } + +func parseToInteger(i interface{}) (int, error) { + switch v := i.(type) { + case float64: + return int(v), nil + case int: + return v, nil + + default: + return 0, fmt.Errorf("expects int or float64 value") + } +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go index 83454644d..165a01bc1 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go @@ -17,6 +17,7 @@ package segment import ( "github.com/RoaringBitmap/roaring" 
"github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum" ) type EmptySegment struct{} @@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit return nil } +func (e *EmptySegment) DocID(num uint64) ([]byte, error) { + return nil, nil +} + func (e *EmptySegment) Count() uint64 { return 0 } @@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error { return nil } +func (e *EmptySegment) Size() uint64 { + return 0 +} + func (e *EmptySegment) AddRef() { } @@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error { type EmptyDictionary struct{} -func (e *EmptyDictionary) PostingsList(term string, - except *roaring.Bitmap) (PostingsList, error) { +func (e *EmptyDictionary) PostingsList(term []byte, + except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) { return &EmptyPostingsList{}, nil } @@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return &EmptyDictionaryIterator{} } +func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + +func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, + includeCount bool) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } +func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { + return nil, nil +} + type EmptyPostingsList struct{} -func (e *EmptyPostingsList) Iterator() PostingsIterator { +func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool, + prealloc PostingsIterator) PostingsIterator { return &EmptyPostingsIterator{} } +func (e *EmptyPostingsList) Size() int { + return 0 +} + func (e *EmptyPostingsList) Count() uint64 { return 0 } @@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{} func (e *EmptyPostingsIterator) Next() 
(Posting, error) { return nil, nil } + +func (e *EmptyPostingsIterator) Size() int { + return 0 +} + +var AnEmptyPostingsIterator = &EmptyPostingsIterator{} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/build.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/build.go deleted file mode 100644 index 57d60dc89..000000000 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/build.go +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "math" - "sort" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -// NewFromAnalyzedDocs places the analyzed document mutations into a new segment -func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { - s := New() - - // ensure that _id field get fieldID 0 - s.getOrDefineField("_id") - - // fill Dicts/DictKeys and preallocate memory - s.initializeDict(results) - - // walk each doc - for _, result := range results { - s.processDocument(result) - } - - // go back and sort the dictKeys - for _, dict := range s.DictKeys { - sort.Strings(dict) - } - - // compute memory usage of segment - s.updateSizeInBytes() - - // professional debugging - // - // log.Printf("fields: %v\n", s.FieldsMap) - // log.Printf("fieldsInv: %v\n", s.FieldsInv) - // log.Printf("fieldsLoc: %v\n", s.FieldsLoc) - // log.Printf("dicts: %v\n", s.Dicts) - // log.Printf("dict keys: %v\n", s.DictKeys) - // for i, posting := range s.Postings { - // log.Printf("posting %d: %v\n", i, posting) - // } - // for i, freq := range s.Freqs { - // log.Printf("freq %d: %v\n", i, freq) - // } - // for i, norm := range s.Norms { - // log.Printf("norm %d: %v\n", i, norm) - // } - // for i, field := range s.Locfields { - // log.Printf("field %d: %v\n", i, field) - // } - // for i, start := range s.Locstarts { - // log.Printf("start %d: %v\n", i, start) - // } - // for i, end := range s.Locends { - // log.Printf("end %d: %v\n", i, end) - // } - // for i, pos := range s.Locpos { - // log.Printf("pos %d: %v\n", i, pos) - // } - // for i, apos := range s.Locarraypos { - // log.Printf("apos %d: %v\n", i, apos) - // } - // log.Printf("stored: %v\n", s.Stored) - // log.Printf("stored types: %v\n", s.StoredTypes) - // log.Printf("stored pos: %v\n", s.StoredPos) - - return s -} - -// fill Dicts/DictKeys and preallocate memory for postings -func (s *Segment) 
initializeDict(results []*index.AnalysisResult) { - var numPostingsLists int - - numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. - numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. - - var numTokenFrequencies int - var totLocs int - - // initial scan for all fieldID's to sort them - for _, result := range results { - for _, field := range result.Document.CompositeFields { - s.getOrDefineField(field.Name()) - } - for _, field := range result.Document.Fields { - s.getOrDefineField(field.Name()) - } - } - sort.Strings(s.FieldsInv[1:]) // keep _id as first field - s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) - for fieldID, fieldName := range s.FieldsInv { - s.FieldsMap[fieldName] = uint16(fieldID + 1) - } - - processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { - for term, tf := range tfs { - pidPlus1, exists := s.Dicts[fieldID][term] - if !exists { - numPostingsLists++ - pidPlus1 = uint64(numPostingsLists) - s.Dicts[fieldID][term] = pidPlus1 - s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) - numTermsPerPostingsList = append(numTermsPerPostingsList, 0) - numLocsPerPostingsList = append(numLocsPerPostingsList, 0) - } - pid := pidPlus1 - 1 - numTermsPerPostingsList[pid] += 1 - numLocsPerPostingsList[pid] += len(tf.Locations) - totLocs += len(tf.Locations) - } - numTokenFrequencies += len(tfs) - } - - for _, result := range results { - // walk each composite field - for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) - _, tf := field.Analyze() - processField(fieldID, tf) - } - - // walk each field - for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - tf := result.Analyzed[i] - processField(fieldID, tf) - } - } - - s.Postings = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.Postings[i] = roaring.New() - } - s.PostingsLocs = 
make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.PostingsLocs[i] = roaring.New() - } - - // Preallocate big, contiguous backing arrays. - auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos. - uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos. - float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms. - uint16Backing := make([]uint16, totLocs) // For sub-Locfields. - - // Point top-level slices to the backing arrays. - s.Freqs = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Norms = make([][]float32, numPostingsLists) - - s.Locfields = make([][]uint16, numPostingsLists) - - s.Locstarts = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Locends = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Locpos = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Locarraypos = make([][][]uint64, numPostingsLists) - - // Point sub-slices to the backing arrays. 
- for pid, numTerms := range numTermsPerPostingsList { - s.Freqs[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numTerms:] - - s.Norms[pid] = float32Backing[0:0] - float32Backing = float32Backing[numTerms:] - } - - for pid, numLocs := range numLocsPerPostingsList { - s.Locfields[pid] = uint16Backing[0:0] - uint16Backing = uint16Backing[numLocs:] - - s.Locstarts[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numLocs:] - - s.Locends[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numLocs:] - - s.Locpos[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numLocs:] - - s.Locarraypos[pid] = auint64Backing[0:0] - auint64Backing = auint64Backing[numLocs:] - } -} - -func (s *Segment) processDocument(result *index.AnalysisResult) { - // used to collate information across fields - docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap)) - fieldLens := make(map[uint16]int, len(s.FieldsMap)) - - docNum := uint64(s.addDocument()) - - processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) { - fieldLens[field] += l - if existingFreqs, ok := docMap[field]; ok { - existingFreqs.MergeAll(name, tf) - } else { - docMap[field] = tf - } - } - - storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { - s.Stored[docNum][field] = append(s.Stored[docNum][field], val) - s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ) - s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos) - } - - // walk each composite field - for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) - l, tf := field.Analyze() - processField(fieldID, field.Name(), l, tf) - } - - // walk each field - for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - l := result.Length[i] - tf := result.Analyzed[i] - processField(fieldID, field.Name(), l, tf) - if field.Options().IsStored() 
{ - storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) - } - - if field.Options().IncludeDocValues() { - s.DocValueFields[fieldID] = true - } - } - - // now that its been rolled up into docMap, walk that - for fieldID, tokenFrequencies := range docMap { - for term, tokenFreq := range tokenFrequencies { - pid := s.Dicts[fieldID][term] - 1 - bs := s.Postings[pid] - bs.AddInt(int(docNum)) - s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) - s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) - locationBS := s.PostingsLocs[pid] - if len(tokenFreq.Locations) > 0 { - locationBS.AddInt(int(docNum)) - for _, loc := range tokenFreq.Locations { - var locf = fieldID - if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field)) - } - s.Locfields[pid] = append(s.Locfields[pid], locf) - s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start)) - s.Locends[pid] = append(s.Locends[pid], uint64(loc.End)) - s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position)) - if len(loc.ArrayPositions) > 0 { - s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions) - } else { - s.Locarraypos[pid] = append(s.Locarraypos[pid], nil) - } - } - } - } - } -} - -func (s *Segment) getOrDefineField(name string) int { - fieldIDPlus1, ok := s.FieldsMap[name] - if !ok { - fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) - s.FieldsMap[name] = fieldIDPlus1 - s.FieldsInv = append(s.FieldsInv, name) - s.Dicts = append(s.Dicts, make(map[string]uint64)) - s.DictKeys = append(s.DictKeys, make([]string, 0)) - } - return int(fieldIDPlus1 - 1) -} - -func (s *Segment) addDocument() int { - docNum := len(s.Stored) - s.Stored = append(s.Stored, map[uint16][][]byte{}) - s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{}) - s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{}) - return docNum -} - -func encodeFieldType(f document.Field) byte { - fieldType := byte('x') - switch f.(type) { - 
case *document.TextField: - fieldType = 't' - case *document.NumericField: - fieldType = 'n' - case *document.DateTimeField: - fieldType = 'd' - case *document.BooleanField: - fieldType = 'b' - case *document.GeoPointField: - fieldType = 'g' - case *document.CompositeField: - fieldType = 'c' - } - return fieldType -} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/dict.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/dict.go deleted file mode 100644 index cf92ef71f..000000000 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/dict.go +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "sort" - "strings" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" -) - -// Dictionary is the in-memory representation of the term dictionary -type Dictionary struct { - segment *Segment - field string - fieldID uint16 -} - -// PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, - except *roaring.Bitmap) (segment.PostingsList, error) { - return &PostingsList{ - dictionary: d, - term: term, - postingsID: d.segment.Dicts[d.fieldID][term], - except: except, - }, nil -} - -// Iterator returns an iterator for this dictionary -func (d *Dictionary) Iterator() segment.DictionaryIterator { - return &DictionaryIterator{ - d: d, - } -} - -// PrefixIterator returns an iterator which only visits terms having the -// the specified prefix -func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix) - return &DictionaryIterator{ - d: d, - prefix: prefix, - offset: offset, - } -} - -// RangeIterator returns an iterator which only visits terms between the -// start and end terms. NOTE: bleve.index API specifies the end is inclusive. 
-func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start) - return &DictionaryIterator{ - d: d, - offset: offset, - end: end, - } -} - -// DictionaryIterator is an iterator for term dictionary -type DictionaryIterator struct { - d *Dictionary - prefix string - end string - offset int - - dictEntry index.DictEntry // reused across Next()'s -} - -// Next returns the next entry in the dictionary -func (d *DictionaryIterator) Next() (*index.DictEntry, error) { - if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { - return nil, nil - } - next := d.d.segment.DictKeys[d.d.fieldID][d.offset] - // check prefix - if d.prefix != "" && !strings.HasPrefix(next, d.prefix) { - return nil, nil - } - // check end (bleve.index API demands inclusive end) - if d.end != "" && next > d.end { - return nil, nil - } - - d.offset++ - postingID := d.d.segment.Dicts[d.d.fieldID][next] - d.dictEntry.Term = next - d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality() - return &d.dictEntry, nil -} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/posting.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/posting.go deleted file mode 100644 index d91a00561..000000000 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/posting.go +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package mem - -import ( - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment" -) - -// PostingsList is an in-memory represenation of a postings list -type PostingsList struct { - dictionary *Dictionary - term string - postingsID uint64 - except *roaring.Bitmap -} - -// Count returns the number of items on this postings list -func (p *PostingsList) Count() uint64 { - var rv uint64 - if p.postingsID > 0 { - rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality() - if p.except != nil { - except := p.except.GetCardinality() - if except > rv { - // avoid underflow - except = rv - } - rv -= except - } - } - return rv -} - -// Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator() segment.PostingsIterator { - rv := &PostingsIterator{ - postings: p, - } - if p.postingsID > 0 { - allbits := p.dictionary.segment.Postings[p.postingsID-1] - rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] - rv.all = allbits.Iterator() - if p.except != nil { - allExcept := allbits.Clone() - allExcept.AndNot(p.except) - rv.actual = allExcept.Iterator() - } else { - rv.actual = allbits.Iterator() - } - } - - return rv -} - -// PostingsIterator provides a way to iterate through the postings list -type PostingsIterator struct { - postings *PostingsList - all roaring.IntIterable - locations *roaring.Bitmap - offset int - locoffset int - actual roaring.IntIterable -} - -// Next returns the next posting on the postings list, or nil at the end -func (i *PostingsIterator) Next() (segment.Posting, error) { - if i.actual == nil || !i.actual.HasNext() { - return nil, nil - } - n := i.actual.Next() - allN := i.all.Next() - - // n is the next actual hit (excluding some postings) - // allN is the next hit in the full postings - // if they don't match, adjust offsets to factor in item we're skipping over - 
// incr the all iterator, and check again - for allN != n { - i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) - i.offset++ - allN = i.all.Next() - } - rv := &Posting{ - iterator: i, - docNum: uint64(n), - offset: i.offset, - locoffset: i.locoffset, - hasLoc: i.locations.Contains(n), - } - - i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) - i.offset++ - return rv, nil -} - -// Posting is a single entry in a postings list -type Posting struct { - iterator *PostingsIterator - docNum uint64 - offset int - locoffset int - hasLoc bool -} - -// Number returns the document number of this posting in this segment -func (p *Posting) Number() uint64 { - return p.docNum -} - -// Frequency returns the frequence of occurance of this term in this doc/field -func (p *Posting) Frequency() uint64 { - return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset] -} - -// Norm returns the normalization factor for this posting -func (p *Posting) Norm() float64 { - return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset]) -} - -// Locations returns the location information for each occurance -func (p *Posting) Locations() []segment.Location { - if !p.hasLoc { - return nil - } - freq := int(p.Frequency()) - rv := make([]segment.Location, freq) - for i := 0; i < freq; i++ { - rv[i] = &Location{ - p: p, - offset: p.locoffset + i, - } - } - return rv -} - -// Location represents the location of a single occurance -type Location struct { - p *Posting - offset int -} - -// Field returns the name of the field (useful in composite fields to know -// which original field the value came from) -func (l *Location) Field() string { - return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]] -} - -// Start returns the start byte offset of 
this occurance -func (l *Location) Start() uint64 { - return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset] -} - -// End returns the end byte offset of this occurance -func (l *Location) End() uint64 { - return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset] -} - -// Pos returns the 1-based phrase position of this occurance -func (l *Location) Pos() uint64 { - return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset] -} - -// ArrayPositions returns the array position vector associated with this occurance -func (l *Location) ArrayPositions() []uint64 { - return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset] -} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/segment.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/segment.go deleted file mode 100644 index 04bdb368a..000000000 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/segment.go +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "fmt" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment" -) - -// _id field is always guaranteed to have fieldID of 0 -const idFieldID uint16 = 0 - -// KNOWN ISSUES -// - LIMITATION - we decided whether or not to store term vectors for a field -// at the segment level, based on the first definition of a -// field we see. in normal bleve usage this is fine, all -// instances of a field definition will be the same. however, -// advanced users may violate this and provide unique field -// definitions with each document. this segment does not -// support this usage. - -// TODO -// - need better testing of multiple docs, iterating freqs, locations and -// and verifying the correct results are returned - -// Segment is an in memory implementation of scorch.Segment -type Segment struct { - - // FieldsMap adds 1 to field id to avoid zero value issues - // name -> field id + 1 - FieldsMap map[string]uint16 - - // FieldsInv is the inverse of FieldsMap - // field id -> name - FieldsInv []string - - // Term dictionaries for each field - // field id -> term -> postings list id + 1 - Dicts []map[string]uint64 - - // Terms for each field, where terms are sorted ascending - // field id -> []term - DictKeys [][]string - - // Postings list - // postings list id -> bitmap by docNum - Postings []*roaring.Bitmap - - // Postings list has locations - PostingsLocs []*roaring.Bitmap - - // Term frequencies - // postings list id -> Freqs (one for each hit in bitmap) - Freqs [][]uint64 - - // Field norms - // postings list id -> Norms (one for each hit in bitmap) - Norms [][]float32 - - // Field/start/end/pos/locarraypos - // postings list id -> start/end/pos/locarraypos (one for each freq) - Locfields [][]uint16 - Locstarts [][]uint64 - Locends [][]uint64 - Locpos [][]uint64 - Locarraypos [][][]uint64 - - // Stored field values - // docNum -> field id -> slice of values (each value []byte) - Stored []map[uint16][][]byte 
- - // Stored field types - // docNum -> field id -> slice of types (each type byte) - StoredTypes []map[uint16][]byte - - // Stored field array positions - // docNum -> field id -> slice of array positions (each is []uint64) - StoredPos []map[uint16][][]uint64 - - // For storing the docValue persisted fields - DocValueFields map[uint16]bool - - // Footprint of the segment, updated when analyzed document mutations - // are added into the segment - sizeInBytes uint64 -} - -// New builds a new empty Segment -func New() *Segment { - return &Segment{ - FieldsMap: map[string]uint16{}, - DocValueFields: map[uint16]bool{}, - } -} - -func (s *Segment) updateSizeInBytes() { - var sizeInBytes uint64 - - // FieldsMap, FieldsInv - for k, _ := range s.FieldsMap { - sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + - 2 /* size of uint16 */) - } - // overhead from the data structures - sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) - - // Dicts, DictKeys - for _, entry := range s.Dicts { - for k, _ := range entry { - sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + - 8 /* size of uint64 */) - } - // overhead from the data structures - sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) - } - sizeInBytes += (segment.SizeOfSlice * 2) - - // Postings, PostingsLocs - for i := 0; i < len(s.Postings); i++ { - sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + - (s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) - } - sizeInBytes += (segment.SizeOfSlice * 2) - - // Freqs, Norms - for i := 0; i < len(s.Freqs); i++ { - sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + - len(s.Norms[i])*4 /* size of float32 */) + - (segment.SizeOfSlice * 2) - } - sizeInBytes += (segment.SizeOfSlice * 2) - - // Location data - for i := 0; i < len(s.Locfields); i++ { - sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + - len(s.Locstarts[i])*8 /* size of uint64 */ + - len(s.Locends[i])*8 /* size of uint64 */ + 
- len(s.Locpos[i])*8 /* size of uint64 */) - - for j := 0; j < len(s.Locarraypos[i]); j++ { - sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + - segment.SizeOfSlice - } - - sizeInBytes += (segment.SizeOfSlice * 5) - } - sizeInBytes += (segment.SizeOfSlice * 5) - - // Stored data - for i := 0; i < len(s.Stored); i++ { - for _, v := range s.Stored[i] { - sizeInBytes += uint64(2 /* size of uint16 */) - for _, arr := range v { - sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice - } - sizeInBytes += segment.SizeOfSlice - } - - for _, v := range s.StoredTypes[i] { - sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice - } - - for _, v := range s.StoredPos[i] { - sizeInBytes += uint64(2 /* size of uint16 */) - for _, arr := range v { - sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + - segment.SizeOfSlice - } - sizeInBytes += segment.SizeOfSlice - } - - // overhead from map(s) within Stored, StoredTypes, StoredPos - sizeInBytes += (segment.SizeOfMap * 3) - } - // overhead from data structures: Stored, StoredTypes, StoredPos - sizeInBytes += (segment.SizeOfSlice * 3) - - // DocValueFields - sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + - segment.SizeOfMap - - // SizeInBytes - sizeInBytes += uint64(8) - - s.sizeInBytes = sizeInBytes -} - -func (s *Segment) SizeInBytes() uint64 { - return s.sizeInBytes -} - -func (s *Segment) AddRef() { -} - -func (s *Segment) DecRef() error { - return nil -} - -// Fields returns the field names used in this segment -func (s *Segment) Fields() []string { - return s.FieldsInv -} - -// VisitDocument invokes the DocFieldValueVistor for each stored field -// for the specified doc number -func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { - // ensure document number exists - if int(num) > len(s.Stored)-1 { - return nil - } - docFields := s.Stored[int(num)] - st := s.StoredTypes[int(num)] - sp := 
s.StoredPos[int(num)] - for field, values := range docFields { - for i, value := range values { - keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) - if !keepGoing { - return nil - } - } - } - return nil -} - -func (s *Segment) getField(name string) (int, error) { - fieldID, ok := s.FieldsMap[name] - if !ok { - return 0, fmt.Errorf("no field named %s", name) - } - return int(fieldID - 1), nil -} - -// Dictionary returns the term dictionary for the specified field -func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { - fieldID, err := s.getField(field) - if err != nil { - // no such field, return empty dictionary - return &segment.EmptyDictionary{}, nil - } - return &Dictionary{ - segment: s, - field: field, - fieldID: uint16(fieldID), - }, nil -} - -// Count returns the number of documents in this segment -// (this has no notion of deleted docs) -func (s *Segment) Count() uint64 { - return uint64(len(s.Stored)) -} - -// DocNumbers returns a bitset corresponding to the doc numbers of all the -// provided _id strings -func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { - rv := roaring.New() - - // guard against empty segment - if len(s.FieldsMap) > 0 { - idDictionary := s.Dicts[idFieldID] - - for _, id := range ids { - postingID := idDictionary[id] - if postingID > 0 { - rv.Or(s.Postings[postingID-1]) - } - } - } - return rv, nil -} - -// Close releases all resources associated with this segment -func (s *Segment) Close() error { - return nil -} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go new file mode 100644 index 000000000..3aa151d64 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go @@ -0,0 +1,75 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package segment + +import ( + "regexp/syntax" + + "github.com/couchbase/vellum/regexp" +) + +func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) { + // TODO: potential optimization where syntax.Regexp supports a Simplify() API? + + parsed, err := syntax.Parse(pattern, syntax.Perl) + if err != nil { + return nil, nil, nil, err + } + + re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit) + if err != nil { + return nil, nil, nil, err + } + + prefix := LiteralPrefix(parsed) + if prefix != "" { + prefixBeg := []byte(prefix) + prefixEnd := IncrementBytes(prefixBeg) + return re, prefixBeg, prefixEnd, nil + } + + return re, nil, nil, nil +} + +// Returns the literal prefix given the parse tree for a regexp +func LiteralPrefix(s *syntax.Regexp) string { + // traverse the left-most branch in the parse tree as long as the + // node represents a concatenation + for s != nil && s.Op == syntax.OpConcat { + if len(s.Sub) < 1 { + return "" + } + + s = s.Sub[0] + } + + if s.Op == syntax.OpLiteral { + return string(s.Rune) + } + + return "" // no literal prefix +} + +func IncrementBytes(in []byte) []byte { + rv := make([]byte, len(in)) + copy(rv, in) + for i := len(rv) - 1; i >= 0; i-- { + rv[i] = rv[i] + 1 + if rv[i] != 0 { + return rv // didn't overflow, so stop + } + } + return nil // overflowed +} diff --git 
a/vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go index d5435ab96..b94d6f979 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go @@ -15,15 +15,14 @@ package segment import ( + "fmt" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum" ) -// Overhead from go data structures when deployed on a 64-bit system. -const SizeOfMap uint64 = 8 -const SizeOfPointer uint64 = 8 -const SizeOfSlice uint64 = 24 -const SizeOfString uint64 = 16 +var ErrClosed = fmt.Errorf("index closed") // DocumentFieldValueVisitor defines a callback to be visited for each // stored field value. The return value determines if the visitor @@ -34,6 +33,9 @@ type Segment interface { Dictionary(field string) (TermDictionary, error) VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error + + DocID(num uint64) ([]byte, error) + Count() uint64 DocNumbers([]string) (*roaring.Bitmap, error) @@ -42,18 +44,21 @@ type Segment interface { Close() error - SizeInBytes() uint64 + Size() int AddRef() DecRef() error } type TermDictionary interface { - PostingsList(term string, except *roaring.Bitmap) (PostingsList, error) + PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator + AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) DictionaryIterator + OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator } type DictionaryIterator interface { @@ -61,7 +66,9 @@ type DictionaryIterator interface { } type PostingsList interface { - Iterator() PostingsIterator + Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) 
PostingsIterator + + Size() int Count() uint64 @@ -77,6 +84,14 @@ type PostingsIterator interface { // implementations may return a shared instance to reduce memory // allocations. Next() (Posting, error) + + // Advance will return the posting with the specified doc number + // or if there is no such posting, the next posting. + // Callers MUST NOT attempt to pass a docNum that is less than or + // equal to the currently visited posting doc Num. + Advance(docNum uint64) (Posting, error) + + Size() int } type Posting interface { @@ -86,6 +101,8 @@ type Posting interface { Norm() float64 Locations() []Location + + Size() int } type Location interface { @@ -94,6 +111,7 @@ type Location interface { End() uint64 Pos() uint64 ArrayPositions() []uint64 + Size() int } // DocumentFieldTermVisitable is implemented by various scorch segment @@ -101,10 +119,17 @@ type Location interface { // postings or other indexed values. type DocumentFieldTermVisitable interface { VisitDocumentFieldTerms(localDocNum uint64, fields []string, - visitor index.DocumentFieldTermVisitor) error + visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error) // VisitableDocValueFields implementation should return // the list of fields which are document value persisted and // therefore visitable by the above VisitDocumentFieldTerms method. 
VisitableDocValueFields() ([]string, error) } + +type DocVisitState interface { +} + +type StatsReporter interface { + ReportBytesWritten(bytesWritten uint64) +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go index 72357ae7d..91bfd4e24 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go @@ -16,19 +16,13 @@ package zap import ( "bufio" - "bytes" - "encoding/binary" "math" "os" - "sort" - - "github.com/Smerity/govarint" - "github.com/blevesearch/bleve/index/scorch/segment/mem" - "github.com/couchbase/vellum" - "github.com/golang/snappy" ) -const version uint32 = 3 +const Version uint32 = 11 + +const Type string = "zap" const fieldNotUninverted = math.MaxUint64 @@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error { return nil } -// PersistSegment takes the in-memory segment and persists it to -// the specified path in the zap file format. 
-func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error { - flag := os.O_RDWR | os.O_CREATE - - f, err := os.OpenFile(path, flag, 0600) - if err != nil { - return err - } - - cleanup := func() { - _ = f.Close() - _ = os.Remove(path) - } - - // buffer the output - br := bufio.NewWriter(f) - - // wrap it for counting (tracking offsets) - cr := NewCountHashWriter(br) - - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err := - persistBase(memSegment, cr, chunkFactor) - if err != nil { - cleanup() - return err - } - - err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, - chunkFactor, cr.Sum32(), cr) - if err != nil { - cleanup() - return err - } - - err = br.Flush() - if err != nil { - cleanup() - return err - } - - err = f.Sync() - if err != nil { - cleanup() - return err - } - - err = f.Close() - if err != nil { - cleanup() - return err - } - - return nil -} - -func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) ( - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, - dictLocs []uint64, err error) { - docValueOffset = uint64(fieldNotUninverted) - - if len(memSegment.Stored) > 0 { - storedIndexOffset, err = persistStored(memSegment, cr) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - postingsListLocs, err := persistPostingsLocs(memSegment, cr) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor) - if err != nil { - return 0, 0, 0, 0, nil, err - } - } 
else { - dictLocs = make([]uint64, len(memSegment.FieldsInv)) - } - - fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset, - dictLocs, nil -} - -func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { - var curr int - var metaBuf bytes.Buffer - var data, compressed []byte - - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) - - docNumOffsets := make(map[int]uint64, len(memSegment.Stored)) - - for docNum, storedValues := range memSegment.Stored { - if docNum != 0 { - // reset buffer if necessary - curr = 0 - metaBuf.Reset() - data = data[:0] - compressed = compressed[:0] - } - - st := memSegment.StoredTypes[docNum] - sp := memSegment.StoredPos[docNum] - - // encode fields in order - for fieldID := range memSegment.FieldsInv { - if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { - stf := st[uint16(fieldID)] - spf := sp[uint16(fieldID)] - - var err2 error - curr, data, err2 = persistStoredFieldValues(fieldID, - storedFieldValues, stf, spf, curr, metaEncoder, data) - if err2 != nil { - return 0, err2 - } - } - } - - metaEncoder.Close() - metaBytes := metaBuf.Bytes() - - // compress the data - compressed = snappy.Encode(compressed, data) - - // record where we're about to start writing - docNumOffsets[docNum] = uint64(w.Count()) - - // write out the meta len and compressed data len - _, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) - if err != nil { - return 0, err - } - - // now write the meta - _, err = w.Write(metaBytes) - if err != nil { - return 0, err - } - // now write the compressed data - _, err = w.Write(compressed) - if err != nil { - return 0, err - } - } - - // return value is the start of the stored index - rv := uint64(w.Count()) - // now write out the stored doc index - for docNum := range memSegment.Stored { - err 
:= binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) - if err != nil { - return 0, err - } - } - - return rv, nil -} - func persistStoredFieldValues(fieldID int, storedFieldValues [][]byte, stf []byte, spf [][]uint64, - curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( + curr int, metaEncode varintEncoder, data []byte) ( int, []byte, error) { for i := 0; i < len(storedFieldValues); i++ { // encode field - _, err := metaEncoder.PutU64(uint64(fieldID)) + _, err := metaEncode(uint64(fieldID)) if err != nil { return 0, nil, err } // encode type - _, err = metaEncoder.PutU64(uint64(stf[i])) + _, err = metaEncode(uint64(stf[i])) if err != nil { return 0, nil, err } // encode start offset - _, err = metaEncoder.PutU64(uint64(curr)) + _, err = metaEncode(uint64(curr)) if err != nil { return 0, nil, err } // end len - _, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + _, err = metaEncode(uint64(len(storedFieldValues[i]))) if err != nil { return 0, nil, err } // encode number of array pos - _, err = metaEncoder.PutU64(uint64(len(spf[i]))) + _, err = metaEncode(uint64(len(spf[i]))) if err != nil { return 0, nil, err } // encode all array positions for _, pos := range spf[i] { - _, err = metaEncoder.PutU64(pos) + _, err = metaEncode(pos) if err != nil { return 0, nil, err } @@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int, return curr, data, nil } -func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { - var freqOffsets, locOfffsets []uint64 - tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - for postingID := range memSegment.Postings { - if postingID != 0 { - tfEncoder.Reset() - } - freqs := memSegment.Freqs[postingID] - norms := memSegment.Norms[postingID] - postingsListItr := memSegment.Postings[postingID].Iterator() - var offset int - for postingsListItr.HasNext() { - - docNum := uint64(postingsListItr.Next()) - - // put 
freq - err := tfEncoder.Add(docNum, freqs[offset]) - if err != nil { - return nil, nil, err - } - - // put norm - norm := norms[offset] - normBits := math.Float32bits(norm) - err = tfEncoder.Add(docNum, uint64(normBits)) - if err != nil { - return nil, nil, err - } - - offset++ - } - - // record where this postings freq info starts - freqOffsets = append(freqOffsets, uint64(w.Count())) - - tfEncoder.Close() - _, err := tfEncoder.Write(w) - if err != nil { - return nil, nil, err - } - - } - - // now do it again for the locations - locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - for postingID := range memSegment.Postings { - if postingID != 0 { - locEncoder.Reset() - } - freqs := memSegment.Freqs[postingID] - locfields := memSegment.Locfields[postingID] - locpos := memSegment.Locpos[postingID] - locstarts := memSegment.Locstarts[postingID] - locends := memSegment.Locends[postingID] - locarraypos := memSegment.Locarraypos[postingID] - postingsListItr := memSegment.Postings[postingID].Iterator() - var offset int - var locOffset int - for postingsListItr.HasNext() { - docNum := uint64(postingsListItr.Next()) - for i := 0; i < int(freqs[offset]); i++ { - if len(locfields) > 0 { - // put field - err := locEncoder.Add(docNum, uint64(locfields[locOffset])) - if err != nil { - return nil, nil, err - } - - // put pos - err = locEncoder.Add(docNum, locpos[locOffset]) - if err != nil { - return nil, nil, err - } - - // put start - err = locEncoder.Add(docNum, locstarts[locOffset]) - if err != nil { - return nil, nil, err - } - - // put end - err = locEncoder.Add(docNum, locends[locOffset]) - if err != nil { - return nil, nil, err - } - - // put the number of array positions to follow - num := len(locarraypos[locOffset]) - err = locEncoder.Add(docNum, uint64(num)) - if err != nil { - return nil, nil, err - } - - // put each array position - for _, pos := range locarraypos[locOffset] { - err = locEncoder.Add(docNum, pos) - if err != nil { 
- return nil, nil, err - } - } - } - locOffset++ - } - offset++ - } - - // record where this postings loc info starts - locOfffsets = append(locOfffsets, uint64(w.Count())) - locEncoder.Close() - _, err := locEncoder.Write(w) - if err != nil { - return nil, nil, err - } - } - return freqOffsets, locOfffsets, nil -} - -func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { - rv = make([]uint64, 0, len(memSegment.PostingsLocs)) - var reuseBuf bytes.Buffer - reuseBufVarint := make([]byte, binary.MaxVarintLen64) - for postingID := range memSegment.PostingsLocs { - // record where we start this posting loc - rv = append(rv, uint64(w.Count())) - // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint) - if err != nil { - return nil, err - } - } - return rv, nil -} - -func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, - postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { - rv = make([]uint64, 0, len(memSegment.Postings)) - var reuseBuf bytes.Buffer - reuseBufVarint := make([]byte, binary.MaxVarintLen64) - for postingID := range memSegment.Postings { - // record where we start this posting list - rv = append(rv, uint64(w.Count())) - - // write out the term info, loc info, and loc posting list offset - _, err = writeUvarints(w, freqOffsets[postingID], - locOffsets[postingID], postingsListLocs[postingID]) - if err != nil { - return nil, err - } - - // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint) - if err != nil { - return nil, err - } - } - return rv, nil -} - -func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { - rv := make([]uint64, 0, len(memSegment.DictKeys)) - - varintBuf := make([]byte, binary.MaxVarintLen64) - - var buffer bytes.Buffer - for fieldID, fieldTerms := range 
memSegment.DictKeys { - if fieldID != 0 { - buffer.Reset() - } - - // start a new vellum for this field - builder, err := vellum.New(&buffer, nil) - if err != nil { - return nil, err - } - - dict := memSegment.Dicts[fieldID] - // now walk the dictionary in order of fieldTerms (already sorted) - for _, fieldTerm := range fieldTerms { - postingID := dict[fieldTerm] - 1 - postingsAddr := postingsLocs[postingID] - err = builder.Insert([]byte(fieldTerm), postingsAddr) - if err != nil { - return nil, err - } - } - err = builder.Close() - if err != nil { - return nil, err - } - - // record where this dictionary starts - rv = append(rv, uint64(w.Count())) - - vellumData := buffer.Bytes() - - // write out the length of the vellum data - n := binary.PutUvarint(varintBuf, uint64(len(vellumData))) - _, err = w.Write(varintBuf[:n]) - if err != nil { - return nil, err - } - - // write this vellum to disk - _, err = w.Write(vellumData) - if err != nil { - return nil, err - } - } - - return rv, nil -} - -type docIDRange []uint64 - -func (a docIDRange) Len() int { return len(a) } -func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] } - -func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, - chunkFactor uint32) (map[uint16]uint64, error) { - fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) - fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - - for fieldID := range memSegment.DocValueFields { - field := memSegment.FieldsInv[fieldID] - docTermMap := make(map[uint64][]byte, 0) - dict, err := memSegment.Dictionary(field) - if err != nil { - return nil, err - } - - dictItr := dict.Iterator() - next, err := dictItr.Next() - for err == nil && next != nil { - postings, err1 := dict.PostingsList(next.Term, nil) - if err1 != nil { - return nil, err - } - - postingsItr := postings.Iterator() - nextPosting, err2 := postingsItr.Next() - for err2 
== nil && nextPosting != nil { - docNum := nextPosting.Number() - docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...) - docTermMap[docNum] = append(docTermMap[docNum], termSeparator) - nextPosting, err2 = postingsItr.Next() - } - if err2 != nil { - return nil, err2 - } - - next, err = dictItr.Next() - } - - if err != nil { - return nil, err - } - // sort wrt to docIDs - var docNumbers docIDRange - for k := range docTermMap { - docNumbers = append(docNumbers, k) - } - sort.Sort(docNumbers) - - for _, docNum := range docNumbers { - err = fdvEncoder.Add(docNum, docTermMap[docNum]) - if err != nil { - return nil, err - } - } - - fieldChunkOffsets[fieldID] = uint64(w.Count()) - err = fdvEncoder.Close() - if err != nil { - return nil, err - } - // persist the doc value details for this field - _, err = fdvEncoder.Write(w) - if err != nil { - return nil, err - } - // reseting encoder for the next field - fdvEncoder.Reset() - } - - return fieldChunkOffsets, nil -} - -func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter, - chunkFactor uint32) (uint64, error) { - fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) - if err != nil { - return 0, err - } - - fieldDocValuesOffset := uint64(w.Count()) - buf := make([]byte, binary.MaxVarintLen64) - offset := uint64(0) - ok := true - for fieldID := range memSegment.FieldsInv { - // if the field isn't configured for docValue, then mark - // the offset accordingly - if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok { - offset = fieldNotUninverted - } - n := binary.PutUvarint(buf, uint64(offset)) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - } - - return fieldDocValuesOffset, nil -} - -func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) { - var br bytes.Buffer - - cr := NewCountHashWriter(&br) - - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err := - persistBase(memSegment, cr, chunkFactor) - if 
err != nil { - return nil, err - } - - return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, - memSegment.FieldsMap, memSegment.FieldsInv, numDocs, - storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) -} - func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, @@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, fieldsIndexOffset: fieldsIndexOffset, docValueOffset: docValueOffset, dictLocs: dictLocs, - fieldDvIterMap: make(map[uint16]*docValueIterator), + fieldDvReaders: make(map[uint16]*docValueReader), } + sb.updateSize() - err := sb.loadDvIterators() + err := sb.loadDvReaders() if err != nil { return nil, err } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go index 83457146e..b9ff8179b 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go @@ -18,41 +18,56 @@ import ( "bytes" "encoding/binary" "io" + "reflect" "github.com/golang/snappy" ) +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + var termSeparator byte = 0xff var termSeparatorSplitSlice = []byte{termSeparator} type chunkedContentCoder struct { - final []byte - chunkSize uint64 - currChunk uint64 - chunkLens []uint64 + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + + w io.Writer + progressiveWrite bool + chunkMetaBuf bytes.Buffer chunkBuf bytes.Buffer chunkMeta []MetaData + + compressed []byte // temp buf for snappy compression } // MetaData represents the data information inside a // chunk. 
type MetaData struct { - DocNum uint64 // docNum of the data inside the chunk - DocDvLoc uint64 // starting offset for a given docid - DocDvLen uint64 // length of data inside the chunk for the given docid + DocNum uint64 // docNum of the data inside the chunk + DocDvOffset uint64 // offset of data inside the chunk for the given docid } // newChunkedContentCoder returns a new chunk content coder which // packs data into chunks based on the provided chunkSize -func newChunkedContentCoder(chunkSize uint64, - maxDocNum uint64) *chunkedContentCoder { +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, + w io.Writer, progressiveWrite bool) *chunkedContentCoder { total := maxDocNum/chunkSize + 1 rv := &chunkedContentCoder{ - chunkSize: chunkSize, - chunkLens: make([]uint64, total), - chunkMeta: make([]MetaData, 0, total), + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: make([]MetaData, 0, total), + w: w, + progressiveWrite: progressiveWrite, } return rv @@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error { // write out the metaData slice for _, meta := range c.chunkMeta { - _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen) + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) if err != nil { return err } @@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error { metaData := c.chunkMetaBuf.Bytes() c.final = append(c.final, c.chunkMetaBuf.Bytes()...) // write the compressed data to the final data - compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) - c.final = append(c.final, compressedData...) + c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) + c.final = append(c.final, c.compressed...) 
+ + c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) + + if c.progressiveWrite { + _, err := c.w.Write(c.final) + if err != nil { + return err + } + c.final = c.final[:0] + } - c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData)) return nil } @@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { c.currChunk = chunk } - // mark the starting offset for this doc + // get the starting offset for this doc dvOffset := c.chunkBuf.Len() dvSize, err := c.chunkBuf.Write(vals) if err != nil { @@ -130,38 +154,77 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { } c.chunkMeta = append(c.chunkMeta, MetaData{ - DocNum: docNum, - DocDvLoc: uint64(dvOffset), - DocDvLen: uint64(dvSize), + DocNum: docNum, + DocDvOffset: uint64(dvOffset + dvSize), }) return nil } // Write commits all the encoded chunked contents to the provided writer. -func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { +// +// | ..... data ..... 
| chunk offsets (varints) +// | position of chunk offsets (uint64) | number of offsets (uint64) | +// +func (c *chunkedContentCoder) Write() (int, error) { var tw int - buf := make([]byte, binary.MaxVarintLen64) - // write out the number of chunks - n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) - nw, err := w.Write(buf[:n]) - tw += nw - if err != nil { - return tw, err + + if c.final != nil { + // write out the data section first + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsStart := uint64(tw) + + if cap(c.final) < binary.MaxVarintLen64 { + c.final = make([]byte, binary.MaxVarintLen64) + } else { + c.final = c.final[0:binary.MaxVarintLen64] } - // write out the chunk lens - for _, chunkLen := range c.chunkLens { - n := binary.PutUvarint(buf, uint64(chunkLen)) - nw, err = w.Write(buf[:n]) + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(c.final, chunkOffset) + nw, err := c.w.Write(c.final[:n]) tw += nw if err != nil { return tw, err } } - // write out the data - nw, err = w.Write(c.final) + + chunkOffsetsLen := uint64(tw) - chunkOffsetsStart + + c.final = c.final[0:8] + // write out the length of chunk offsets + binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + // write out the number of chunks + binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) + nw, err = c.w.Write(c.final) tw += nw if err != nil { return tw, err } + + c.final = c.final[:0] + return tw, nil } + +// ReadDocValueBoundary elicits the start, end offsets from a +// metaData header slice +func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = metaHeaders[chunk-1].DocDvOffset + } + return start, metaHeaders[chunk].DocDvOffset +} diff --git 
a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go index d75e83c03..50290f888 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go @@ -17,6 +17,8 @@ package zap import ( "hash/crc32" "io" + + "github.com/blevesearch/bleve/index/scorch/segment" ) // CountHashWriter is a wrapper around a Writer which counts the number of @@ -25,6 +27,7 @@ type CountHashWriter struct { w io.Writer crc uint32 n int + s segment.StatsReporter } // NewCountHashWriter returns a CountHashWriter which wraps the provided Writer @@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter { return &CountHashWriter{w: w} } +func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { + return &CountHashWriter{w: w, s: s} +} + // Write writes the provided bytes to the wrapped writer and counts the bytes func (c *CountHashWriter) Write(b []byte) (int, error) { n, err := c.w.Write(b) c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) c.n += n + if c.s != nil { + c.s.ReportBytesWritten(uint64(n)) + } return n, err } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go index e5d712686..2c0e1bf2a 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go @@ -15,38 +15,51 @@ package zap import ( + "bytes" "fmt" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" - "github.com/couchbase/vellum/regexp" ) // Dictionary is the zap representation of the term dictionary type Dictionary struct { - sb *SegmentBase - field string - fieldID uint16 - fst *vellum.FST + sb 
*SegmentBase + field string + fieldID uint16 + fst *vellum.FST + fstReader *vellum.Reader } // PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return d.postingsList([]byte(term), except, nil) +func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, + prealloc segment.PostingsList) (segment.PostingsList, error) { + var preallocPL *PostingsList + pl, ok := prealloc.(*PostingsList) + if ok && pl != nil { + preallocPL = pl + } + return d.postingsList(term, except, preallocPL) } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { - if d.fst == nil { + if d.fstReader == nil { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } return d.postingsListInit(rv, except), nil } - postingsOffset, exists, err := d.fst.Get(term) + postingsOffset, exists, err := d.fstReader.Get(term) if err != nil { return nil, fmt.Errorf("vellum err: %v", err) } if !exists { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } return d.postingsListInit(rv, except), nil } @@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari } func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { - if rv == nil { + if rv == nil || rv == emptyPostingsList { rv = &PostingsList{} } else { + postings := rv.postings + if postings != nil { + postings.Clear() + } + *rv = PostingsList{} // clear the struct + + rv.postings = postings } rv.sb = d.sb rv.except = except @@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator { itr, err := d.fst.Iterator(nil, nil) if err == nil { rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err } } @@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { d: d, } + kBeg := 
[]byte(prefix) + kEnd := segment.IncrementBytes(kBeg) + if d.fst != nil { - r, err := regexp.New(prefix + ".*") + itr, err := d.fst.Iterator(kBeg, kEnd) if err == nil { - itr, err := d.fst.Search(r, nil, nil) - if err == nil { - rv.itr = itr - } + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err } } @@ -130,36 +154,103 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator itr, err := d.fst.Iterator([]byte(start), endBytes) if err == nil { rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// AutomatonIterator returns an iterator which only visits terms +// having the the vellum automaton and start/end key range +func (d *Dictionary) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, + includeCount bool) segment.DictionaryIterator { + + rv := &DictionaryIterator{ + d: d, + omitCount: !includeCount, + } + + var buf bytes.Buffer + builder, err := vellum.New(&buf, nil) + if err != nil { + rv.err = err + return rv + } + for _, term := range onlyTerms { + err = builder.Insert(term, 0) + if err != nil { + rv.err = err + return rv } } + err = builder.Close() + if err != nil { + rv.err = err + return rv + } + + onlyFST, err := vellum.Load(buf.Bytes()) + if err != nil { + rv.err = err + return rv + } + + itr, err := d.fst.Search(onlyFST, nil, nil) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } return rv } // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { - d *Dictionary - itr vellum.Iterator - err error - tmp PostingsList + d 
*Dictionary + itr vellum.Iterator + err error + tmp PostingsList + entry index.DictEntry + omitCount bool } // Next returns the next entry in the dictionary func (i *DictionaryIterator) Next() (*index.DictEntry, error) { - if i.itr == nil || i.err == vellum.ErrIteratorDone { - return nil, nil - } else if i.err != nil { + if i.err != nil && i.err != vellum.ErrIteratorDone { return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil } term, postingsOffset := i.itr.Current() - i.err = i.tmp.read(postingsOffset, i.d) - if i.err != nil { - return nil, i.err - } - rv := &index.DictEntry{ - Term: string(term), - Count: i.tmp.Count(), + i.entry.Term = string(term) + if !i.omitCount { + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } + i.entry.Count = i.tmp.Count() } i.err = i.itr.Next() - return rv, nil + return &i.entry, nil } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go index 0514bd307..bcc0f9472 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go @@ -19,93 +19,129 @@ import ( "encoding/binary" "fmt" "math" + "reflect" "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" "github.com/golang/snappy" ) -type docValueIterator struct { +var reflectStaticSizedocValueReader int + +func init() { + var dvi docValueReader + reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) +} + +type docNumTermsVisitor func(docNum uint64, terms []byte) error + +type docVisitState struct { + dvrs map[uint16]*docValueReader + segment *Segment +} + +type docValueReader struct { field string curChunkNum uint64 - numChunks uint64 - chunkLens []uint64 + chunkOffsets []uint64 dvDataLoc uint64 curChunkHeader 
[]MetaData curChunkData []byte // compressed data cache + uncompressed []byte // temp buf for snappy decompression } -func (di *docValueIterator) sizeInBytes() uint64 { - // curChunkNum, numChunks, dvDataLoc --> uint64 - sizeInBytes := 24 - - // field - sizeInBytes += (len(di.field) + int(segment.SizeOfString)) +func (di *docValueReader) size() int { + return reflectStaticSizedocValueReader + size.SizeOfPtr + + len(di.field) + + len(di.chunkOffsets)*size.SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) +} - // chunkLens, curChunkHeader - sizeInBytes += len(di.chunkLens)*8 + - len(di.curChunkHeader)*24 + - int(segment.SizeOfSlice*2) /* overhead from slices */ +func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { + if rv == nil { + rv = &docValueReader{} + } - // curChunkData is mmap'ed, not included + rv.field = di.field + rv.curChunkNum = math.MaxUint64 + rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable + rv.dvDataLoc = di.dvDataLoc + rv.curChunkHeader = rv.curChunkHeader[:0] + rv.curChunkData = nil + rv.uncompressed = rv.uncompressed[:0] - return uint64(sizeInBytes) + return rv } -func (di *docValueIterator) fieldName() string { +func (di *docValueReader) fieldName() string { return di.field } -func (di *docValueIterator) curChunkNumber() uint64 { +func (di *docValueReader) curChunkNumber() uint64 { return di.curChunkNum } -func (s *SegmentBase) loadFieldDocValueIterator(field string, - fieldDvLoc uint64) (*docValueIterator, error) { +func (s *SegmentBase) loadFieldDocValueReader(field string, + fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { // get the docValue offset for the given fields - if fieldDvLoc == fieldNotUninverted { - return nil, fmt.Errorf("loadFieldDocValueIterator: "+ + if fieldDvLocStart == fieldNotUninverted { + return nil, fmt.Errorf("loadFieldDocValueReader: "+ "no docValues found for field: %s", field) } - // read the number of chunks, chunk 
lengths - var offset, clen uint64 - numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) - if read <= 0 { - return nil, fmt.Errorf("failed to read the field "+ - "doc values for field %s", field) + // read the number of chunks, and chunk offsets position + var numChunks, chunkOffsetsPosition uint64 + + if fieldDvLocEnd-fieldDvLocStart > 16 { + numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) + // acquire position of chunk offsets + chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen } - offset += uint64(read) - fdvIter := &docValueIterator{ - curChunkNum: math.MaxUint64, - field: field, - chunkLens: make([]uint64, int(numChunks)), + fdvIter := &docValueReader{ + curChunkNum: math.MaxUint64, + field: field, + chunkOffsets: make([]uint64, int(numChunks)), } + + // read the chunk offsets + var offset uint64 for i := 0; i < int(numChunks); i++ { - clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) if read <= 0 { - return nil, fmt.Errorf("corrupted chunk length during segment load") + return nil, fmt.Errorf("corrupted chunk offset during segment load") } - fdvIter.chunkLens[i] = clen + fdvIter.chunkOffsets[i] = loc offset += uint64(read) } - fdvIter.dvDataLoc = fieldDvLoc + offset + // set the data offset + fdvIter.dvDataLoc = fieldDvLocStart + return fdvIter, nil } -func (di *docValueIterator) loadDvChunk(chunkNumber, - localDocNum uint64, s *SegmentBase) error { +func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { // advance to the chunk where the docValues // reside for the given docNum - destChunkDataLoc := di.dvDataLoc - for i := 0; i < int(chunkNumber); i++ { - destChunkDataLoc += 
di.chunkLens[i] + destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc + start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] + di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil } - curChunkSize := di.chunkLens[chunkNumber] + destChunkDataLoc += start + curChunkEnd += end + // read the number of docs reside in the chunk numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) if read <= 0 { @@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, chunkMetaLoc := destChunkDataLoc + uint64(read) offset := uint64(0) - di.curChunkHeader = make([]MetaData, int(numDocs)) + if cap(di.curChunkHeader) < int(numDocs) { + di.curChunkHeader = make([]MetaData, int(numDocs)) + } else { + di.curChunkHeader = di.curChunkHeader[:int(numDocs)] + } for i := 0; i < int(numDocs); i++ { di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) - di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(read) - di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) } compressedDataLoc := chunkMetaLoc + offset - dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc + dataLength := curChunkEnd - compressedDataLoc di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil +} + +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < 
len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + if di.curChunkData == nil || len(di.curChunkHeader) == 0 { + continue + } + + // uncompress the already loaded data + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + return nil } -func (di *docValueIterator) visitDocValues(docNum uint64, +func (di *docValueReader) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum - start, length := di.getDocValueLocs(docNum) - if start == math.MaxUint64 || length == math.MaxUint64 { + start, end := di.getDocValueLocs(docNum) + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { return nil } - // uncompress the already loaded data - uncompressed, err := snappy.Decode(nil, di.curChunkData) - if err != nil { - return err + + var uncompressed []byte + var err error + // use the uncompressed copy if available + if len(di.uncompressed) > 0 { + uncompressed = di.uncompressed + } else { + // uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed } // pick the terms for the given docNum - uncompressed = uncompressed[start : start+length] + uncompressed = uncompressed[start:end] for { i := bytes.Index(uncompressed, termSeparatorSplitSlice) if i < 0 { @@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64, return nil } -func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { +func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, 
uint64) { i := sort.Search(len(di.curChunkHeader), func(i int) bool { return di.curChunkHeader[i].DocNum >= docNum }) if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { - return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen + return ReadDocValueBoundary(i, di.curChunkHeader) } return math.MaxUint64, math.MaxUint64 } // VisitDocumentFieldTerms is an implementation of the // DocumentFieldTermVisitable interface -func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, - visitor index.DocumentFieldTermVisitor) error { - fieldIDPlus1 := uint16(0) - ok := true +func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( + segment.DocVisitState, error) { + dvs, ok := dvsIn.(*docVisitState) + if !ok || dvs == nil { + dvs = &docVisitState{} + } else { + if dvs.segment != s { + dvs.segment = s + dvs.dvrs = nil + } + } + + var fieldIDPlus1 uint16 + if dvs.dvrs == nil { + dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvIter, exists := s.fieldDvReaders[fieldID]; exists && + dvIter != nil { + dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) + } + } + } + + // find the chunkNumber where the docValues are stored + docInChunk := localDocNum / uint64(s.chunkFactor) + var dvr *docValueReader for _, field := range fields { if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { continue } - // find the chunkNumber where the docValues are stored - docInChunk := localDocNum / uint64(s.chunkFactor) - - if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists && - dvIter != nil { + fieldID := fieldIDPlus1 - 1 + if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { // check if the chunk is already loaded - if docInChunk != dvIter.curChunkNumber() { - err := dvIter.loadDvChunk(docInChunk, 
localDocNum, s) + if docInChunk != dvr.curChunkNumber() { + err := dvr.loadDvChunk(docInChunk, &s.SegmentBase) if err != nil { - continue + return dvs, err } } - _ = dvIter.visitDocValues(localDocNum, visitor) + _ = dvr.visitDocValues(localDocNum, visitor) } } - return nil + return dvs, nil } // VisitableDocValueFields returns the list of fields with // persisted doc value terms ready to be visitable using the // VisitDocumentFieldTerms method. func (s *Segment) VisitableDocValueFields() ([]string, error) { - var rv []string - for fieldID, field := range s.fieldsInv { - if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok && - dvIter != nil { - rv = append(rv, field) - } - } - return rv, nil + return s.fieldDvNames, nil } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go index 3c708dd57..cd6ff73c7 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go @@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { for i, itr := range rv.itrs { rv.currKs[i], rv.currVs[i] = itr.Current() } - rv.updateMatches() - if rv.lowK == nil { + rv.updateMatches(false) + if rv.lowK == nil && len(rv.lowIdxs) == 0 { return rv, vellum.ErrIteratorDone } return rv, nil } // updateMatches maintains the low key matches based on the currKs -func (m *enumerator) updateMatches() { +func (m *enumerator) updateMatches(skipEmptyKey bool) { m.lowK = nil m.lowIdxs = m.lowIdxs[:0] m.lowCurr = 0 for i, key := range m.currKs { - if key == nil { + if (key == nil && m.currVs[i] == 0) || // in case of empty iterator + (len(key) == 0 && skipEmptyKey) { // skip empty keys continue } cmp := bytes.Compare(key, m.lowK) - if cmp < 0 || m.lowK == nil { + if cmp < 0 || len(m.lowIdxs) == 0 { // reached a new low m.lowK = key m.lowIdxs = m.lowIdxs[:0] @@ -102,9 
+103,10 @@ func (m *enumerator) Next() error { } m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() } - m.updateMatches() + // can skip any empty keys encountered at this point + m.updateMatches(true) } - if m.lowK == nil { + if m.lowK == nil && len(m.lowIdxs) == 0 { return vellum.ErrIteratorDone } return nil diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go index b505fec94..571d06edb 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go @@ -18,16 +18,12 @@ import ( "bytes" "encoding/binary" "io" - - "github.com/Smerity/govarint" ) type chunkedIntCoder struct { final []byte - maxDocNum uint64 chunkSize uint64 chunkBuf bytes.Buffer - encoder *govarint.Base128Encoder chunkLens []uint64 currChunk uint64 @@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { total := maxDocNum/chunkSize + 1 rv := &chunkedIntCoder{ chunkSize: chunkSize, - maxDocNum: maxDocNum, chunkLens: make([]uint64, total), final: make([]byte, 0, 64), } - rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf) return rv } @@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { chunk := docNum / c.chunkSize if chunk != c.currChunk { // starting a new chunk - if c.encoder != nil { - // close out last - c.Close() - c.chunkBuf.Reset() - } + c.Close() + c.chunkBuf.Reset() c.currChunk = chunk } + if len(c.buf) < binary.MaxVarintLen64 { + c.buf = make([]byte, binary.MaxVarintLen64) + } + for _, val := range vals { - _, err := c.encoder.PutU64(val) + wb := binary.PutUvarint(c.buf, val) + _, err := c.chunkBuf.Write(c.buf[:wb]) if err != nil { return err } @@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { return nil } +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) 
error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + // Close indicates you are done calling Add() this allows the final chunk // to be encoded. func (c *chunkedIntCoder) Close() { - c.encoder.Close() encodingBytes := c.chunkBuf.Bytes() c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) c.final = append(c.final, encodingBytes...) + c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close } // Write commits all the encoded chunked integers to the provided writer. @@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { } buf := c.buf - // write out the number of chunks & each chunkLen - n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) - for _, chunkLen := range c.chunkLens { - n += binary.PutUvarint(buf[n:], uint64(chunkLen)) + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) } tw, err := w.Write(buf[:n]) @@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { } return tw, nil } + +func (c *chunkedIntCoder) FinalSize() int { + return len(c.final) +} + +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary +// will figure out the start and end of every chunk from +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. 
+// eg: +// Lens -> 5 5 5 5 => 5 10 15 20 +// Lens -> 0 5 0 5 => 0 5 5 10 +// Lens -> 0 0 0 5 => 0 0 0 5 +// Lens -> 5 0 0 0 => 5 5 5 5 +// Lens -> 0 5 0 0 => 0 5 5 5 +// Lens -> 0 0 5 0 => 0 0 5 5 +func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { + var runningOffset uint64 + var index, i int + for i = 1; i <= len(lengths); i++ { + runningOffset += lengths[i-1] + lengths[index] = runningOffset + index++ + } + return lengths +} + +func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = offsets[chunk-1] + } + return start, offsets[chunk] +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go index ae8c5b197..4ef222c1a 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go @@ -24,11 +24,13 @@ import ( "sort" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" + seg "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" "github.com/golang/snappy" ) +var DefaultFileMergerBufferSize = 1024 * 1024 + const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of zap segments and bit masks describing which @@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // remaining data. This new segment is built at the specified path, // with the provided chunkFactor. 
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, - chunkFactor uint32) ([][]uint64, error) { + chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + segmentBases[segmenti] = &segment.SegmentBase + } + + return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s) +} + +func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, + chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) if err != nil { - return nil, err + return nil, 0, err } cleanup := func() { @@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, _ = os.Remove(path) } - segmentBases := make([]*SegmentBase, len(segments)) - for segmenti, segment := range segments { - segmentBases[segmenti] = &segment.SegmentBase - } - // buffer the output - br := bufio.NewWriter(f) + br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) // wrap it for counting (tracking offsets) - cr := NewCountHashWriter(br) + cr := NewCountHashWriterWithStatsReporter(br, s) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := - MergeToWriter(segmentBases, drops, chunkFactor, cr) + MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh) if err != nil { cleanup() - return nil, err + return nil, 0, err } err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, chunkFactor, cr.Sum32(), cr) if err != nil { cleanup() - return nil, err + return nil, 0, err } err = br.Flush() if err != nil { cleanup() - return nil, err + return nil, 0, err } err = f.Sync() if err != nil { cleanup() - return nil, err + return nil, 0, err } err = f.Close() if err != nil { cleanup() - return nil, err + return nil, 0, err } - return 
newDocNums, nil + return newDocNums, uint64(cr.Count()), nil } func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, - chunkFactor uint32, cr *CountHashWriter) ( + chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) ( newDocNums [][]uint64, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, @@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap = mapFields(fieldsInv) numDocs = computeNewDocCount(segments, drops) + + if isClosed(closeCh) { + return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed + } + if numDocs > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, - fieldsMap, fieldsInv, fieldsSame, numDocs, cr) + fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) if err != nil { return nil, 0, 0, 0, 0, nil, nil, nil, err } - dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, - newDocNums, numDocs, chunkFactor, cr) + dictLocs, docValueOffset, err = persistMergedRest(segments, drops, + fieldsInv, fieldsMap, fieldsSame, + newDocNums, numDocs, chunkFactor, cr, closeCh) if err != nil { return nil, 0, 0, 0, 0, nil, nil, nil, err } @@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 } func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, - fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, - newSegDocCount uint64, chunkFactor uint32, - w *CountHashWriter) ([]uint64, uint64, error) { + fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, + newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, + w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { - var bufReuse bytes.Buffer var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufLoc []uint64 @@ -168,28 +182,22 @@ func persistMergedRest(segments 
[]*SegmentBase, dropsIn []*roaring.Bitmap, var postItr *PostingsIterator rv := make([]uint64, len(fieldsInv)) - fieldDvLocs := make([]uint64, len(fieldsInv)) + fieldDvLocsStart := make([]uint64, len(fieldsInv)) + fieldDvLocsEnd := make([]uint64, len(fieldsInv)) tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - // docTermMap is keyed by docNum, where the array impl provides - // better memory usage behavior than a sparse-friendlier hashmap - // for when docs have much structural similarity (i.e., every doc - // has a given field) - var docTermMap [][]byte - var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, 0, err + } + + newRoaring := roaring.NewBitmap() // for each field for fieldID, fieldName := range fieldsInv { - if fieldID != 0 { - vellumBuf.Reset() - } - newVellum, err := vellum.New(&vellumBuf, nil) - if err != nil { - return nil, 0, err - } // collect FST iterators from all active segments for this field var newDocNums [][]uint64 @@ -197,7 +205,15 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var dicts []*Dictionary var itrs []vellum.Iterator + var segmentsInFocus []*SegmentBase + for segmentI, segment := range segments { + + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + dict, err2 := segment.dictionary(fieldName) if err2 != nil { return nil, 0, err2 @@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } if itr != nil { newDocNums = append(newDocNums, newDocNumsIn[segmentI]) - drops = append(drops, dropsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } dicts = append(dicts, dict) itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) } } } - if 
uint64(cap(docTermMap)) < newSegDocCount { - docTermMap = make([][]byte, newSegDocCount) - } else { - docTermMap = docTermMap[0:newSegDocCount] - for docNum := range docTermMap { // reset the docTermMap - docTermMap[docNum] = docTermMap[docNum][:0] - } - } - var prevTerm []byte - newRoaring := roaring.NewBitmap() - newRoaringLocs := roaring.NewBitmap() + newRoaring.Clear() - finishTerm := func(term []byte) error { - if term == nil { - return nil + var lastDocNum, lastFreq, lastNorm uint64 + + // determines whether to use "1-hit" encoding optimization + // when a term appears in only 1 doc, with no loc info, + // has freq of 1, and the docNum fits into 31-bits + use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { + if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { + docNum := uint64(newRoaring.Minimum()) + if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { + return true, docNum, lastNorm + } } + return false, 0, 0 + } + finishTerm := func(term []byte) error { tfEncoder.Close() locEncoder.Close() - if newRoaring.GetCardinality() > 0 { - // this field/term actually has hits in the new segment, lets write it down - freqOffset := uint64(w.Count()) - _, err := tfEncoder.Write(w) - if err != nil { - return err - } - locOffset := uint64(w.Count()) - _, err = locEncoder.Write(w) - if err != nil { - return err - } - postingLocOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64) - if err != nil { - return err - } - postingOffset := uint64(w.Count()) - - // write out the start of the term info - n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return err - } - // write out the start of the loc info - n = binary.PutUvarint(bufMaxVarintLen64, locOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return err - } - // write out the start of the posting locs - n = 
binary.PutUvarint(bufMaxVarintLen64, postingLocOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return err - } - _, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64) - if err != nil { - return err - } + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + if err != nil { + return err + } - err = newVellum.Insert(term, postingOffset) + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) if err != nil { return err } } - newRoaring = roaring.NewBitmap() - newRoaringLocs = roaring.NewBitmap() + newRoaring.Clear() tfEncoder.Reset() locEncoder.Reset() + lastDocNum = 0 + lastFreq = 0 + lastNorm = 0 + return nil } @@ -301,66 +291,39 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, term, itrI, postingsOffset := enumerator.Current() if !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + // if the term changed, write out the info collected // for the previous term - err2 := finishTerm(prevTerm) - if err2 != nil { - return nil, 0, err2 + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err } } - var err2 error - postings, err2 = dicts[itrI].postingsListFromOffset( + postings, err = dicts[itrI].postingsListFromOffset( postingsOffset, drops[itrI], postings) - if err2 != nil { - return nil, 0, err2 + if err != nil { + return nil, 0, err } - newDocNumsI := newDocNums[itrI] - - postItr = postings.iterator(postItr) - next, err2 := postItr.Next() - for next != nil && err2 == nil { - hitNewDocNum := newDocNumsI[next.Number()] - if hitNewDocNum == docDropped { - return nil, 0, fmt.Errorf("see hit with dropped doc num") - } - newRoaring.Add(uint32(hitNewDocNum)) - // encode norm bits - norm := next.Norm() - normBits := math.Float32bits(float32(norm)) - err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) - if err != nil { - 
return nil, 0, err - } - locs := next.Locations() - if len(locs) > 0 { - newRoaringLocs.Add(uint32(hitNewDocNum)) - for _, loc := range locs { - if cap(bufLoc) < 5+len(loc.ArrayPositions()) { - bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) - } - args := bufLoc[0:5] - args[0] = uint64(fieldsMap[loc.Field()] - 1) - args[1] = loc.Pos() - args[2] = loc.Start() - args[3] = loc.End() - args[4] = uint64(len(loc.ArrayPositions())) - args = append(args, loc.ArrayPositions()...) - err = locEncoder.Add(hitNewDocNum, args...) - if err != nil { - return nil, 0, err - } - } - } - - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) - - next, err2 = postItr.Next() + postItr = postings.iterator(true, true, true, postItr) + + if fieldsSame { + // can optimize by copying freq/norm/loc bytes directly + lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( + term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder) + } else { + lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder, bufLoc) } - if err2 != nil { - return nil, 0, err2 + if err != nil { + return nil, 0, err } prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem @@ -368,7 +331,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, err = enumerator.Next() } - if err != nil && err != vellum.ErrIteratorDone { + if err != vellum.ErrIteratorDone { return nil, 0, err } @@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, rv[fieldID] = dictOffset + // get the field doc value offset (start) + fieldDvLocsStart[fieldID] = uint64(w.Count()) + // update the field doc values - fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) - for docNum, docTerms := range docTermMap { - if len(docTerms) > 0 { - err = fdvEncoder.Add(uint64(docNum), docTerms) + 
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + for segmentI, segment := range segmentsInFocus { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && + dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) if err != nil { return nil, 0, err } } } - err = fdvEncoder.Close() - if err != nil { - return nil, 0, err - } - // get the field doc value offset - fieldDvLocs[fieldID] = uint64(w.Count()) + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } + + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } - // persist the doc value details for this field - _, err = fdvEncoder.Write(w) + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) if err != nil { return nil, 0, err } @@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, fieldDvLocsOffset := uint64(w.Count()) buf := bufMaxVarintLen64 - for _, offset := range fieldDvLocs { - n := binary.PutUvarint(buf, uint64(offset)) + for i := 0; i < len(fieldDvLocsStart); i++ { + n := binary.PutUvarint(buf, fieldDvLocsStart[i]) _, err := w.Write(buf[:n]) 
if err != nil { return nil, 0, err } + n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, 0, err + } } return rv, fieldDvLocsOffset, nil } +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + nextNorm := uint64(math.Float32bits(float32(next.Norm()))) + + locs := next.Locations() + + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + if err != nil { + return 0, 0, 0, nil, err + } + + if len(locs) > 0 { + numBytesLocs := 0 + for _, loc := range locs { + ap := loc.ArrayPositions() + numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), + loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) + } + + err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) + if err != nil { + return 0, 0, 0, nil, err + } + + for _, loc := range locs { + ap := loc.ArrayPositions() + if cap(bufLoc) < 5+len(ap) { + bufLoc = make([]uint64, 0, 5+len(ap)) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(ap)) + args = append(args, ap...) + err = locEncoder.Add(hitNewDocNum, args...) 
+ if err != nil { + return 0, 0, 0, nil, err + } + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + +func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := + postItr.nextBytes() + for err == nil && len(nextFreqNormBytes) > 0 { + hitNewDocNum := newDocNums[nextDocNum] + if hitNewDocNum == docDropped { + return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") + } + + newRoaring.Add(uint32(hitNewDocNum)) + err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) + if err != nil { + return 0, 0, 0, err + } + + if len(nextLocBytes) > 0 { + err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) + if err != nil { + return 0, 0, 0, err + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = + postItr.nextBytes() + } + + return lastDocNum, lastFreq, lastNorm, err +} + +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, + use1HitEncoding func(uint64) (bool, uint64, uint64), + w *CountHashWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + if use1HitEncoding != nil { + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil + } + } + + tfOffset := uint64(w.Count()) + _, err = tfEncoder.Write(w) + if err != nil { + return 0, err + } + + locOffset := uint64(w.Count()) + _, err = locEncoder.Write(w) + if err != nil { + return 0, err + } + + 
postingsOffset := uint64(w.Count()) + + n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + +type varintEncoder func(uint64) (int, error) + func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, - w *CountHashWriter) (uint64, [][]uint64, error) { + w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { var rv [][]uint64 // The remapped or newDocNums for each segment. var newDocNum uint64 var curr int - var metaBuf bytes.Buffer var data, compressed []byte - - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + var metaBuf bytes.Buffer + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return metaBuf.Write(varBuf[:wb]) + } vals := make([][][]byte, len(fieldsInv)) typs := make([][]byte, len(fieldsInv)) poss := make([][][]uint64, len(fieldsInv)) + var posBuf []uint64 + docNumOffsets := make([]uint64, newSegDocCount) + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + // for each segment for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + segNewDocNums := make([]uint64, segment.numDocs) dropsI := drops[segI] @@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, curr = 0 metaBuf.Reset() data = data[:0] - compressed = compressed[:0] + + posTemp := posBuf // collect all the data for i := 0; i < len(fieldsInv); i++ { @@ -503,42 +676,63 @@ func 
mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, typs[i] = typs[i][:0] poss[i] = poss[i][:0] } - err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { fieldID := int(fieldsMap[field]) - 1 vals[fieldID] = append(vals[fieldID], value) typs[fieldID] = append(typs[fieldID], typ) - poss[fieldID] = append(poss[fieldID], pos) + + // copy array positions to preserve them beyond the scope of this callback + var curPos []uint64 + if len(pos) > 0 { + if cap(posTemp) < len(pos) { + posBuf = make([]uint64, len(pos)*len(fieldsInv)) + posTemp = posBuf + } + curPos = posTemp[0:len(pos)] + copy(curPos, pos) + posTemp = posTemp[len(pos):] + } + poss[fieldID] = append(poss[fieldID], curPos) + return true }) if err != nil { return 0, nil, err } - // now walk the fields in order - for fieldID := range fieldsInv { - storedFieldValues := vals[int(fieldID)] + // _id field special case optimizes ExternalID() lookups + idFieldVal := vals[uint16(0)][0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, nil, err + } + + // now walk the non-"_id" fields in order + for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { + storedFieldValues := vals[fieldID] - stf := typs[int(fieldID)] - spf := poss[int(fieldID)] + stf := typs[fieldID] + spf := poss[fieldID] var err2 error curr, data, err2 = persistStoredFieldValues(fieldID, - storedFieldValues, stf, spf, curr, metaEncoder, data) + storedFieldValues, stf, spf, curr, metaEncode, data) if err2 != nil { return 0, nil, err2 } } - metaEncoder.Close() metaBytes := metaBuf.Bytes() - compressed = snappy.Encode(compressed, data) + compressed = snappy.Encode(compressed[:cap(compressed)], data) // record where we're about to start writing docNumOffsets[newDocNum] = uint64(w.Count()) // write out the meta len and compressed data len - _, err = writeUvarints(w, 
uint64(len(metaBytes)), uint64(len(compressed))) + _, err = writeUvarints(w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) if err != nil { return 0, nil, err } @@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, if err != nil { return 0, nil, err } + // now write the _id field val (counted as part of the 'compressed' data) + _, err = w.Write(idFieldVal) + if err != nil { + return 0, nil, err + } // now write the compressed data _, err = w.Write(compressed) if err != nil { @@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { return fieldsSame, rv } + +func isClosed(closeCh chan struct{}) bool { + select { + case <-closeCh: + return true + default: + return false + } +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go new file mode 100644 index 000000000..22b69913e --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go @@ -0,0 +1,826 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +var NewSegmentBufferNumResultsBump int = 100 +var NewSegmentBufferNumResultsFactor float64 = 1.0 +var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 + +// AnalysisResultsToSegmentBase produces an in-memory zap-encoded +// SegmentBase from analysis results +func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, + chunkFactor uint32) (*SegmentBase, uint64, error) { + s := interimPool.Get().(*interim) + + var br bytes.Buffer + if s.lastNumDocs > 0 { + // use previous results to initialize the buf with an estimate + // size, but note that the interim instance comes from a + // global interimPool, so multiple scorch instances indexing + // different docs can lead to low quality estimates + estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * + NewSegmentBufferNumResultsFactor) + estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * + NewSegmentBufferAvgBytesPerDocFactor) + br.Grow(estimateAvgBytesPerDoc * estimateNumResults) + } + + s.results = results + s.chunkFactor = chunkFactor + s.w = NewCountHashWriter(&br) + + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, + err := s.convert() + if err != nil { + return nil, uint64(0), err + } + + sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, + s.FieldsMap, s.FieldsInv, uint64(len(results)), + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + + if err == nil && s.reset() == nil { + s.lastNumDocs = len(results) + s.lastOutSize = len(br.Bytes()) + interimPool.Put(s) + } + + return sb, uint64(len(br.Bytes())), err +} + +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + +// interim 
holds temporary working data used while converting from +// analysis results to a zap-encoded segment +type interim struct { + results []*index.AnalysisResult + + chunkFactor uint32 + + w *CountHashWriter + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + // Term dictionaries for each field + // field id -> term -> postings list id + 1 + Dicts []map[string]uint64 + + // Terms for each field, where terms are sorted ascending + // field id -> []term + DictKeys [][]string + + // Fields whose IncludeDocValues is true + // field id -> bool + IncludeDocValues []bool + + // postings id -> bitmap of docNums + Postings []*roaring.Bitmap + + // postings id -> freq/norm's, one for each docNum in postings + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm + + // postings id -> locs, one for each freq + Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings list id + + builder *vellum.Builder + builderBuf bytes.Buffer + + metaBuf bytes.Buffer + + tmp0 []byte + tmp1 []byte + + lastNumDocs int + lastOutSize int +} + +func (s *interim) reset() (err error) { + s.results = nil + s.chunkFactor = 0 + s.w = nil + s.FieldsMap = nil + s.FieldsInv = nil + for i := range s.Dicts { + s.Dicts[i] = nil + } + s.Dicts = s.Dicts[:0] + for i := range s.DictKeys { + s.DictKeys[i] = s.DictKeys[i][:0] + } + s.DictKeys = s.DictKeys[:0] + for i := range s.IncludeDocValues { + s.IncludeDocValues[i] = false + } + s.IncludeDocValues = s.IncludeDocValues[:0] + for _, idn := range s.Postings { + idn.Clear() + } + s.Postings = s.Postings[:0] + s.FreqNorms = s.FreqNorms[:0] + for i := range s.freqNormsBacking { + s.freqNormsBacking[i] = interimFreqNorm{} + } + s.freqNormsBacking = s.freqNormsBacking[:0] + s.Locs = s.Locs[:0] + for i 
:= range s.locsBacking { + s.locsBacking[i] = interimLoc{} + } + s.locsBacking = s.locsBacking[:0] + s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] + s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] + s.builderBuf.Reset() + if s.builder != nil { + err = s.builder.Reset(&s.builderBuf) + } + s.metaBuf.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + s.lastNumDocs = 0 + s.lastOutSize = 0 + + return err +} + +func (s *interim) grabBuf(size int) []byte { + buf := s.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + s.tmp0 = buf + } + return buf[0:size] +} + +type interimStoredField struct { + vals [][]byte + typs []byte + arrayposs [][]uint64 // array positions +} + +type interimFreqNorm struct { + freq uint64 + norm float32 + numLocs int +} + +type interimLoc struct { + fieldID uint16 + pos uint64 + start uint64 + end uint64 + arrayposs []uint64 +} + +func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { + s.FieldsMap = map[string]uint16{} + + s.getOrDefineField("_id") // _id field is fieldID 0 + + for _, result := range s.results { + for _, field := range result.Document.CompositeFields { + s.getOrDefineField(field.Name()) + } + for _, field := range result.Document.Fields { + s.getOrDefineField(field.Name()) + } + } + + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + if cap(s.IncludeDocValues) >= len(s.FieldsInv) { + s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] + } else { + s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + } + + s.prepareDicts() + + for _, dict := range s.DictKeys { + sort.Strings(dict) + } + + s.processDocuments() + + storedIndexOffset, err := s.writeStoredFields() + if err != nil { + return 0, 0, 0, nil, err + } + + var fdvIndexOffset uint64 + var dictOffsets []uint64 + + if len(s.results) > 0 { + fdvIndexOffset, dictOffsets, err = s.writeDicts() + if err != nil { + return 
0, 0, 0, nil, err + } + } else { + dictOffsets = make([]uint64, len(s.FieldsInv)) + } + + fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) + if err != nil { + return 0, 0, 0, nil, err + } + + return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil +} + +func (s *interim) getOrDefineField(fieldName string) int { + fieldIDPlus1, exists := s.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[fieldName] = fieldIDPlus1 + s.FieldsInv = append(s.FieldsInv, fieldName) + + s.Dicts = append(s.Dicts, make(map[string]uint64)) + + n := len(s.DictKeys) + if n < cap(s.DictKeys) { + s.DictKeys = s.DictKeys[:n+1] + s.DictKeys[n] = s.DictKeys[n][:0] + } else { + s.DictKeys = append(s.DictKeys, []string(nil)) + } + } + + return int(fieldIDPlus1 - 1) +} + +// fill Dicts and DictKeys from analysis results +func (s *interim) prepareDicts() { + var pidNext int + + var totTFs int + var totLocs int + + visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + dict := s.Dicts[fieldID] + dictKeys := s.DictKeys[fieldID] + + for term, tf := range tfs { + pidPlus1, exists := dict[term] + if !exists { + pidNext++ + pidPlus1 = uint64(pidNext) + + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) + + s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) + s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) + } + + pid := pidPlus1 - 1 + + s.numTermsPerPostingsList[pid] += 1 + s.numLocsPerPostingsList[pid] += len(tf.Locations) + + totLocs += len(tf.Locations) + } + + totTFs += len(tfs) + + s.DictKeys[fieldID] = dictKeys + } + + for _, result := range s.results { + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + _, tf := field.Analyze() + visitField(fieldID, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := 
uint16(s.getOrDefineField(field.Name())) + tf := result.Analyzed[i] + visitField(fieldID, tf) + } + } + + numPostingsLists := pidNext + + if cap(s.Postings) >= numPostingsLists { + s.Postings = s.Postings[:numPostingsLists] + } else { + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, s.Postings[:cap(s.Postings)]) + for i := 0; i < numPostingsLists; i++ { + if postings[i] == nil { + postings[i] = roaring.New() + } + } + s.Postings = postings + } + + if cap(s.FreqNorms) >= numPostingsLists { + s.FreqNorms = s.FreqNorms[:numPostingsLists] + } else { + s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } + + if cap(s.freqNormsBacking) >= totTFs { + s.freqNormsBacking = s.freqNormsBacking[:totTFs] + } else { + s.freqNormsBacking = make([]interimFreqNorm, totTFs) + } + + freqNormsBacking := s.freqNormsBacking + for pid, numTerms := range s.numTermsPerPostingsList { + s.FreqNorms[pid] = freqNormsBacking[0:0] + freqNormsBacking = freqNormsBacking[numTerms:] + } + + if cap(s.Locs) >= numPostingsLists { + s.Locs = s.Locs[:numPostingsLists] + } else { + s.Locs = make([][]interimLoc, numPostingsLists) + } + + if cap(s.locsBacking) >= totLocs { + s.locsBacking = s.locsBacking[:totLocs] + } else { + s.locsBacking = make([]interimLoc, totLocs) + } + + locsBacking := s.locsBacking + for pid, numLocs := range s.numLocsPerPostingsList { + s.Locs[pid] = locsBacking[0:0] + locsBacking = locsBacking[numLocs:] + } +} + +func (s *interim) processDocuments() { + numFields := len(s.FieldsInv) + reuseFieldLens := make([]int, numFields) + reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) + + for docNum, result := range s.results { + for i := 0; i < numFields; i++ { // clear these for reuse + reuseFieldLens[i] = 0 + reuseFieldTFs[i] = nil + } + + s.processDocument(uint64(docNum), result, + reuseFieldLens, reuseFieldTFs) + } +} + +func (s *interim) processDocument(docNum uint64, + result *index.AnalysisResult, + fieldLens []int, fieldTFs 
[]analysis.TokenFrequencies) { + visitField := func(fieldID uint16, fieldName string, + ln int, tf analysis.TokenFrequencies) { + fieldLens[fieldID] += ln + + existingFreqs := fieldTFs[fieldID] + if existingFreqs != nil { + existingFreqs.MergeAll(fieldName, tf) + } else { + fieldTFs[fieldID] = tf + } + } + + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln, tf := field.Analyze() + visitField(fieldID, field.Name(), ln, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln := result.Length[i] + tf := result.Analyzed[i] + visitField(fieldID, field.Name(), ln, tf) + } + + // now that it's been rolled up into fieldTFs, walk that + for fieldID, tfs := range fieldTFs { + dict := s.Dicts[fieldID] + norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) + + for term, tf := range tfs { + pid := dict[term] - 1 + bs := s.Postings[pid] + bs.Add(uint32(docNum)) + + s.FreqNorms[pid] = append(s.FreqNorms[pid], + interimFreqNorm{ + freq: uint64(tf.Frequency()), + norm: norm, + numLocs: len(tf.Locations), + }) + + if len(tf.Locations) > 0 { + locs := s.Locs[pid] + + for _, loc := range tf.Locations { + var locf = uint16(fieldID) + if loc.Field != "" { + locf = uint16(s.getOrDefineField(loc.Field)) + } + var arrayposs []uint64 + if len(loc.ArrayPositions) > 0 { + arrayposs = loc.ArrayPositions + } + locs = append(locs, interimLoc{ + fieldID: locf, + pos: uint64(loc.Position), + start: uint64(loc.Start), + end: uint64(loc.End), + arrayposs: arrayposs, + }) + } + + s.Locs[pid] = locs + } + } + } +} + +func (s *interim) writeStoredFields() ( + storedIndexOffset uint64, err error) { + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return s.metaBuf.Write(varBuf[:wb]) + } + + data, compressed := s.tmp0[:0], s.tmp1[:0] 
+ defer func() { s.tmp0, s.tmp1 = data, compressed }() + + // keyed by docNum + docStoredOffsets := make([]uint64, len(s.results)) + + // keyed by fieldID, for the current doc in the loop + docStoredFields := map[uint16]interimStoredField{} + + for docNum, result := range s.results { + for fieldID := range docStoredFields { // reset for next doc + delete(docStoredFields, fieldID) + } + + for _, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + + opts := field.Options() + + if opts.IsStored() { + isf := docStoredFields[fieldID] + isf.vals = append(isf.vals, field.Value()) + isf.typs = append(isf.typs, encodeFieldType(field)) + isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) + docStoredFields[fieldID] = isf + } + + if opts.IncludeDocValues() { + s.IncludeDocValues[fieldID] = true + } + } + + var curr int + + s.metaBuf.Reset() + data = data[:0] + + // _id field special case optimizes ExternalID() lookups + idFieldVal := docStoredFields[uint16(0)].vals[0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, err + } + + // handle non-"_id" fields + for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { + isf, exists := docStoredFields[uint16(fieldID)] + if exists { + curr, data, err = persistStoredFieldValues( + fieldID, isf.vals, isf.typs, isf.arrayposs, + curr, metaEncode, data) + if err != nil { + return 0, err + } + } + } + + metaBytes := s.metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + docStoredOffsets[docNum] = uint64(s.w.Count()) + + _, err := writeUvarints(s.w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, err + } + + _, err = s.w.Write(metaBytes) + if err != nil { + return 0, err + } + + _, err = s.w.Write(idFieldVal) + if err != nil { + return 0, err + } + + _, err = s.w.Write(compressed) + if err != nil { + return 0, err + } + } + + storedIndexOffset = uint64(s.w.Count()) + + for _, 
docStoredOffset := range docStoredOffsets { + err = binary.Write(s.w, binary.BigEndian, docStoredOffset) + if err != nil { + return 0, err + } + } + + return storedIndexOffset, nil +} + +func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { + dictOffsets = make([]uint64, len(s.FieldsInv)) + + fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) + fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) + + buf := s.grabBuf(binary.MaxVarintLen64) + + tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) + locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) + fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false) + + var docTermMap [][]byte + + if s.builder == nil { + s.builder, err = vellum.New(&s.builderBuf, nil) + if err != nil { + return 0, nil, err + } + } + + for fieldID, terms := range s.DictKeys { + if cap(docTermMap) < len(s.results) { + docTermMap = make([][]byte, len(s.results)) + } else { + docTermMap = docTermMap[0:len(s.results)] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + + dict := s.Dicts[fieldID] + + for _, term := range terms { // terms are already sorted + pid := dict[term] - 1 + + postingsBS := s.Postings[pid] + + freqNorms := s.FreqNorms[pid] + freqNormOffset := 0 + + locs := s.Locs[pid] + locOffset := 0 + + postingsItr := postingsBS.Iterator() + for postingsItr.HasNext() { + docNum := uint64(postingsItr.Next()) + + freqNorm := freqNorms[freqNormOffset] + + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), + uint64(math.Float32bits(freqNorm.norm))) + if err != nil { + return 0, nil, err + } + + if freqNorm.numLocs > 0 { + numBytesLocs := 0 + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + 
uint64(len(loc.arrayposs)), loc.arrayposs) + } + + err = locEncoder.Add(docNum, uint64(numBytesLocs)) + if err != nil { + return 0, nil, err + } + + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs))) + if err != nil { + return 0, nil, err + } + + err = locEncoder.Add(docNum, loc.arrayposs...) + if err != nil { + return 0, nil, err + } + } + + locOffset += freqNorm.numLocs + } + + freqNormOffset++ + + docTermMap[docNum] = append( + append(docTermMap[docNum], term...), + termSeparator) + } + + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) + if err != nil { + return 0, nil, err + } + + if postingsOffset > uint64(0) { + err = s.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return 0, nil, err + } + } + + tfEncoder.Reset() + locEncoder.Reset() + } + + err = s.builder.Close() + if err != nil { + return 0, nil, err + } + + // record where this dictionary starts + dictOffsets[fieldID] = uint64(s.w.Count()) + + vellumData := s.builderBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + + // write this vellum to disk + _, err = s.w.Write(vellumData) + if err != nil { + return 0, nil, err + } + + // reset vellum for reuse + s.builderBuf.Reset() + + err = s.builder.Reset(&s.builderBuf) + if err != nil { + return 0, nil, err + } + + // write the field doc values + if s.IncludeDocValues[fieldID] { + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + if err != nil { + return 0, nil, err + } + } + } + err = fdvEncoder.Close() + if err != nil { + return 0, nil, err + } + + fdvOffsetsStart[fieldID] = uint64(s.w.Count()) + + _, err = fdvEncoder.Write() + 
if err != nil { + return 0, nil, err + } + + fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) + + fdvEncoder.Reset() + } else { + fdvOffsetsStart[fieldID] = fieldNotUninverted + fdvOffsetsEnd[fieldID] = fieldNotUninverted + } + } + + fdvIndexOffset = uint64(s.w.Count()) + + for i := 0; i < len(fdvOffsetsStart); i++ { + n := binary.PutUvarint(buf, fdvOffsetsStart[i]) + _, err := s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + } + + return fdvIndexOffset, dictOffsets, nil +} + +func encodeFieldType(f document.Field) byte { + fieldType := byte('x') + switch f.(type) { + case *document.TextField: + fieldType = 't' + case *document.NumericField: + fieldType = 'n' + case *document.DateTimeField: + fieldType = 'd' + case *document.BooleanField: + fieldType = 'b' + case *document.GeoPointField: + fieldType = 'g' + case *document.CompositeField: + fieldType = 'c' + } + return fieldType +} + +// returns the total # of bytes needed to encode the given uint64's +// into binary.PutUVarint() encoding +func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { + n = numUvarintBytes(a) + n += numUvarintBytes(b) + n += numUvarintBytes(c) + n += numUvarintBytes(d) + n += numUvarintBytes(e) + for _, v := range more { + n += numUvarintBytes(v) + } + return n +} + +// returns # of bytes needed to encode x in binary.PutUvarint() encoding +func numUvarintBytes(x uint64) (n int) { + for x >= 0x80 { + x >>= 7 + n++ + } + return n + 1 +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go index d504885d0..26378c27e 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go @@ -18,71 +18,245 @@ import ( "bytes" "encoding/binary" "fmt" + "io" 
"math" + "reflect" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) -// PostingsList is an in-memory represenation of a postings list +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + +// FST or vellum value (uint64) encoding is determined by the top two +// highest-order or most significant bits... +// +// encoding : MSB +// name : 63 62 61...to...bit #0 (LSB) +// ----------+---+---+--------------------------------------------------- +// general : 0 | 0 | 62-bits of postingsOffset. +// ~ : 0 | 1 | reserved for future. +// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. +// ~ : 1 | 1 | reserved for future. +// +// Encoding "general" is able to handle all cases, where the +// postingsOffset points to more information about the postings for +// the term. +// +// Encoding "1-hit" is used to optimize a commonly seen case when a +// term has only a single hit. For example, a term in the _id field +// will have only 1 hit. The "1-hit" encoding is used for a term +// in a field when... +// +// - term vector info is disabled for that field; +// - and, the term appears in only a single doc for that field; +// - and, the term's freq is exactly 1 in that single doc for that field; +// - and, the docNum must fit into 31-bits; +// +// Otherwise, the "general" encoding is used instead. 
+// +// In the "1-hit" encoding, the field in that single doc may have +// other terms, which is supported in the "1-hit" encoding by the +// positive float31 norm. + +const FSTValEncodingMask = uint64(0xc000000000000000) +const FSTValEncodingGeneral = uint64(0x0000000000000000) +const FSTValEncoding1Hit = uint64(0x8000000000000000) + +func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { + return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) +} + +func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { + return (mask31Bits & v), (mask31Bits & (v >> 31)) +} + +const mask31Bits = uint64(0x000000007fffffff) + +func under32Bits(x uint64) bool { + return x <= mask31Bits +} + +const DocNum1HitFinished = math.MaxUint64 + +var NormBits1Hit = uint64(math.Float32bits(float32(1))) + +// PostingsList is an in-memory representation of a postings list type PostingsList struct { sb *SegmentBase postingsOffset uint64 freqOffset uint64 locOffset uint64 - locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap + + // when normBits1Hit != 0, then this postings list came from a + // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply + docNum1Hit uint64 + normBits1Hit uint64 +} + +// represents an immutable, empty postings list +var emptyPostingsList = &PostingsList{} + +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { + if p.normBits1Hit != 0 { + receiver.Add(uint32(p.docNum1Hit)) + return + } + + if p.postings != nil { + receiver.Or(p.postings) + } } // Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator() segment.PostingsIterator { - return p.iterator(nil) +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, + prealloc 
segment.PostingsIterator) segment.PostingsIterator { + if p.normBits1Hit == 0 && p.postings == nil { + return emptyPostingsIterator + } + + var preallocPI *PostingsIterator + pi, ok := prealloc.(*PostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if preallocPI == emptyPostingsIterator { + preallocPI = nil + } + + return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) } -func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, + rv *PostingsIterator) *PostingsIterator { if rv == nil { rv = &PostingsIterator{} } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.Reset([]byte(nil)) + } + + locReader := rv.locReader + if locReader != nil { + locReader.Reset([]byte(nil)) + } + + freqChunkOffsets := rv.freqChunkOffsets[:0] + locChunkOffsets := rv.locChunkOffsets[:0] + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.freqChunkOffsets = freqChunkOffsets + rv.locChunkOffsets = locChunkOffsets + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf } + rv.postings = p + rv.includeFreqNorm = includeFreq || includeNorm + rv.includeLocs = includeLocs - if p.postings != nil { - // prepare the freq chunk details - var n uint64 - var read int + if p.normBits1Hit != 0 { + // "1-hit" encoding + rv.docNum1Hit = p.docNum1Hit + rv.normBits1Hit = p.normBits1Hit + + if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { + rv.docNum1Hit = DocNum1HitFinished + } + + return rv + } + + // "general" encoding, check if empty + if p.postings == nil { + return rv + } + + var n uint64 + var read int + + // prepare the freq chunk details + if rv.includeFreqNorm { var numFreqChunks uint64 numFreqChunks, read = 
binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.freqChunkLens = make([]uint64, int(numFreqChunks)) + if cap(rv.freqChunkOffsets) >= int(numFreqChunks) { + rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)] + } else { + rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) + } for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.freqChunkStart = p.freqOffset + n + } - // prepare the loc chunk details + // prepare the loc chunk details + if rv.includeLocs { n = 0 var numLocChunks uint64 numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locChunkLens = make([]uint64, int(numLocChunks)) + if cap(rv.locChunkOffsets) >= int(numLocChunks) { + rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)] + } else { + rv.locChunkOffsets = make([]uint64, int(numLocChunks)) + } for i := 0; i < int(numLocChunks); i++ { - rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.locChunkStart = p.locOffset + n - rv.locBitmap = p.locBitmap + } - rv.all = p.postings.Iterator() - if p.except != nil { - allExcept := roaring.AndNot(p.postings, p.except) - rv.actual = allExcept.Iterator() - } else { - rv.actual = p.postings.Iterator() - } + rv.all = p.postings.Iterator() + if p.except != nil { + rv.ActualBM = roaring.AndNot(p.postings, p.except) + rv.Actual = rv.ActualBM.Iterator() + } else { + rv.ActualBM = p.postings + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. 
} return rv @@ -90,23 +264,30 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { - if p.postings != nil { - n := p.postings.GetCardinality() - if p.except != nil { - e := p.except.GetCardinality() - if e > n { - e = n - } - return n - e - } - return n + var n uint64 + if p.normBits1Hit != 0 { + n = 1 + } else if p.postings != nil { + n = p.postings.GetCardinality() + } + var e uint64 + if p.except != nil { + e = p.except.GetCardinality() } - return 0 + if n <= e { + return 0 + } + return n - e } func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.postingsOffset = postingsOffset + // handle "1-hit" encoding special case + if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { + return rv.init1Hit(postingsOffset) + } + // read the location of the freq/norm details var n uint64 var read int @@ -117,29 +298,16 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapOffset uint64 - locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) - - locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - - rv.locBitmap = roaring.NewBitmap() - _, err := rv.locBitmap.FromBuffer(locRoaringBytes) - if err != nil { - return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) - } - var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] - 
rv.postings = roaring.NewBitmap() - _, err = rv.postings.FromBuffer(roaringBytes) + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } + _, err := rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } @@ -147,65 +315,137 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { return nil } +func (rv *PostingsList) init1Hit(fstVal uint64) error { + docNum, normBits := FSTValDecode1Hit(fstVal) + + rv.docNum1Hit = docNum + rv.normBits1Hit = normBits + + return nil +} + // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { - postings *PostingsList - all roaring.IntIterable - offset int - locoffset int - actual roaring.IntIterable + postings *PostingsList + all roaring.IntIterable + Actual roaring.IntIterable + ActualBM *roaring.Bitmap currChunk uint32 currChunkFreqNorm []byte currChunkLoc []byte - freqNormDecoder *govarint.Base128Decoder - locDecoder *govarint.Base128Decoder - freqChunkLens []uint64 - freqChunkStart uint64 + freqNormReader *bytes.Reader + locReader *bytes.Reader + + freqChunkOffsets []uint64 + freqChunkStart uint64 + + locChunkOffsets []uint64 + locChunkStart uint64 - locChunkLens []uint64 - locChunkStart uint64 + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls + nextSegmentLocs []segment.Location // reused across Next() calls - locBitmap *roaring.Bitmap + docNum1Hit uint64 + normBits1Hit uint64 - next Posting + buf []byte + + includeFreqNorm bool + includeLocs bool } -func (i *PostingsIterator) loadChunk(chunk int) error { - if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) { - return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens)) +var emptyPostingsIterator = &PostingsIterator{} + +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + 
size.SizeOfPtr + + len(i.currChunkFreqNorm) + + len(i.currChunkLoc) + + len(i.freqChunkOffsets)*size.SizeOfUint64 + + len(i.locChunkOffsets)*size.SizeOfUint64 + + i.next.Size() + + for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() } - // load correct chunk bytes - start := i.freqChunkStart - for j := 0; j < chunk; j++ { - start += i.freqChunkLens[j] + + return sizeInBytes +} + +func (i *PostingsIterator) loadChunk(chunk int) error { + if i.includeFreqNorm { + if chunk >= len(i.freqChunkOffsets) { + return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", + chunk, len(i.freqChunkOffsets)) + } + + end, start := i.freqChunkStart, i.freqChunkStart + s, e := readChunkBoundary(chunk, i.freqChunkOffsets) + start += s + end += e + i.currChunkFreqNorm = i.postings.sb.mem[start:end] + if i.freqNormReader == nil { + i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) + } else { + i.freqNormReader.Reset(i.currChunkFreqNorm) + } } - end := start + i.freqChunkLens[chunk] - i.currChunkFreqNorm = i.postings.sb.mem[start:end] - i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) - start = i.locChunkStart - for j := 0; j < chunk; j++ { - start += i.locChunkLens[j] + if i.includeLocs { + if chunk >= len(i.locChunkOffsets) { + return fmt.Errorf("tried to load loc chunk that doesn't exist %d/(%d)", + chunk, len(i.locChunkOffsets)) + } + + end, start := i.locChunkStart, i.locChunkStart + s, e := readChunkBoundary(chunk, i.locChunkOffsets) + start += s + end += e + i.currChunkLoc = i.postings.sb.mem[start:end] + if i.locReader == nil { + i.locReader = bytes.NewReader(i.currChunkLoc) + } else { + i.locReader.Reset(i.currChunkLoc) + } } - end = start + i.locChunkLens[chunk] - i.currChunkLoc = i.postings.sb.mem[start:end] - i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) + i.currChunk = uint32(chunk) return nil } -func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { - freq, 
err := i.freqNormDecoder.GetU64() +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { + if i.normBits1Hit != 0 { + return 1, i.normBits1Hit, false, nil + } + + freqHasLocs, err := binary.ReadUvarint(i.freqNormReader) if err != nil { - return 0, 0, fmt.Errorf("error reading frequency: %v", err) + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } - normBits, err := i.freqNormDecoder.GetU64() + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + + normBits, err := binary.ReadUvarint(i.freqNormReader) if err != nil { - return 0, 0, fmt.Errorf("error reading norm: %v", err) + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } - return freq, normBits, err + + return freq, normBits, hasLocs, err +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs } // readLocation processes all the integers on the stream representing a single @@ -214,27 +454,27 @@ func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { // the contents. 
func (i *PostingsIterator) readLocation(l *Location) error { // read off field - fieldID, err := i.locDecoder.GetU64() + fieldID, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location field: %v", err) } // read off pos - pos, err := i.locDecoder.GetU64() + pos, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location pos: %v", err) } // read off start - start, err := i.locDecoder.GetU64() + start, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location start: %v", err) } // read off end - end, err := i.locDecoder.GetU64() + end, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location end: %v", err) } // read off num array pos - numArrayPos, err := i.locDecoder.GetU64() + numArrayPos, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location num array pos: %v", err) } @@ -245,14 +485,16 @@ func (i *PostingsIterator) readLocation(l *Location) error { l.pos = pos l.start = start l.end = end - if numArrayPos > 0 { + if cap(l.ap) < int(numArrayPos) { l.ap = make([]uint64, int(numArrayPos)) + } else { + l.ap = l.ap[:int(numArrayPos)] } } // read off array positions for k := 0; k < int(numArrayPos); k++ { - ap, err := i.locDecoder.GetU64() + ap, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading array position: %v", err) } @@ -266,97 +508,332 @@ func (i *PostingsIterator) readLocation(l *Location) error { // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { - if i.actual == nil || !i.actual.HasNext() { - return nil, nil + return i.nextAtOrAfter(0) +} + +// Advance returns the posting at the specified docNum or it is not present +// the next posting, or if the end is reached, nil +func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) { 
+ return i.nextAtOrAfter(docNum) +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) { + docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter) + if err != nil || !exists { + return nil, err } - n := i.actual.Next() - nChunk := n / i.postings.sb.chunkFactor - allN := i.all.Next() - allNChunk := allN / i.postings.sb.chunkFactor - // n is the next actual hit (excluding some postings) - // allN is the next hit in the full postings - // if they don't match, adjust offsets to factor in item we're skipping over - // incr the all iterator, and check again - for allN != n { + i.next = Posting{} // clear the struct + rv := &i.next + rv.docNum = docNum + + if !i.includeFreqNorm { + return rv, nil + } + + var normBits uint64 + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return nil, err + } + + rv.norm = math.Float32frombits(uint32(normBits)) - // in different chunks, reset offsets - if allNChunk != nChunk { - i.locoffset = 0 - i.offset = 0 + if i.includeLocs && hasLocs { + // prepare locations into reused slices, where we assume + // rv.freq >= "number of locs", since in a composite field, + // some component fields might have their IncludeTermVector + // flags disabled while other component fields are enabled + if cap(i.nextLocs) >= int(rv.freq) { + i.nextLocs = i.nextLocs[0:rv.freq] } else { + i.nextLocs = make([]Location, rv.freq, rv.freq*2) + } + if cap(i.nextSegmentLocs) < int(rv.freq) { + i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) + } + rv.locs = i.nextSegmentLocs[:0] - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) - } - } + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) 
+ } - // read off freq/offsets even though we don't care about them - freq, _, err := i.readFreqNorm() + j := 0 + startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader + for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) { + err := i.readLocation(&i.nextLocs[j]) if err != nil { return nil, err } - if i.locBitmap.Contains(allN) { - for j := 0; j < int(freq); j++ { - err := i.readLocation(nil) - if err != nil { - return nil, err - } - } - } + rv.locs = append(rv.locs, &i.nextLocs[j]) + j++ + } + } + + return rv, nil +} + +var freqHasLocs1Hit = encodeFreqHasLocs(1, false) + +// nextBytes returns the docNum and the encoded freq & loc bytes for +// the next posting +func (i *PostingsIterator) nextBytes() ( + docNumOut uint64, freq uint64, normBits uint64, + bytesFreqNorm []byte, bytesLoc []byte, err error) { + docNum, exists, err := i.nextDocNumAtOrAfter(0) + if err != nil || !exists { + return 0, 0, 0, nil, nil, err + } + + if i.normBits1Hit != 0 { + if i.buf == nil { + i.buf = make([]byte, binary.MaxVarintLen64*2) + } + n := binary.PutUvarint(i.buf, freqHasLocs1Hit) + n += binary.PutUvarint(i.buf[n:], i.normBits1Hit) + return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil + } + + startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() + + var hasLocs bool + + freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return 0, 0, 0, nil, nil, err + } + + endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() + bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] + + if hasLocs { + startLoc := len(i.currChunkLoc) - i.locReader.Len() + + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return 0, 0, 0, nil, nil, + fmt.Errorf("error reading location nextBytes numLocs: %v", err) + } + + // skip over all the location bytes + _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) + if err != nil { + return 0, 0, 0, nil, nil, err + } + + endLoc := 
len(i.currChunkLoc) - i.locReader.Len() + bytesLoc = i.currChunkLoc[startLoc:endLoc] + } + + return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil +} + +// nextDocNum returns the next docNum on the postings list, and also +// sets up the currChunk / loc related fields of the iterator. +func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if i.normBits1Hit != 0 { + if i.docNum1Hit == DocNum1HitFinished { + return 0, false, nil + } + if i.docNum1Hit < atOrAfter { + // advanced past our 1-hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return 0, false, nil + } + docNum := i.docNum1Hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return docNum, true, nil + } + + if i.Actual == nil || !i.Actual.HasNext() { + return 0, false, nil + } + + if i.postings == nil || i.postings.postings == i.ActualBM { + return i.nextDocNumAtOrAfterClean(atOrAfter) + } + + n := i.Actual.Next() + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + } + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + allN := i.all.Next() + + nChunk := n / i.postings.sb.chunkFactor - // in same chunk, need to account for offsets - i.offset++ + // when allN becomes >= to here, then allN is in the same chunk as nChunk. 
+ allNReachesNChunk := nChunk * i.postings.sb.chunkFactor + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do + for allN != n { + // we've reached same chunk, so move the freq/norm/loc decoders forward + if i.includeFreqNorm && allN >= allNReachesNChunk { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, err + } } allN = i.all.Next() } + if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +// optimization when the postings list is "clean" (e.g., no updates & +// no deletions) where the all bitmap is the same as the actual bitmap +func (i *PostingsIterator) nextDocNumAtOrAfterClean( + atOrAfter uint64) (uint64, bool, error) { + n := i.Actual.Next() + + if !i.includeFreqNorm { + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + } + + if uint64(n) < atOrAfter { + return 0, false, nil // couldn't find anything + } + + return uint64(n), true, nil + } + + // freq-norm's needed, so maintain freq-norm chunk reader + sameChunkNexts := 0 // # of times we called Next() in the same chunk + + nChunk := n / i.postings.sb.chunkFactor + + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + + nChunkPrev := nChunk + nChunk = n / i.postings.sb.chunkFactor + + if nChunk != nChunkPrev { + sameChunkNexts = 0 + } else { + sameChunkNexts += 1 + } + } + + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) + } + } + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { - return 
nil, fmt.Errorf("error loading chunk: %v", err) + return 0, false, fmt.Errorf("error loading chunk: %v", err) } } - i.next = Posting{} // clear the struct. - rv := &i.next - rv.iterator = i - rv.docNum = uint64(n) + return uint64(n), true, nil +} - var err error - var normBits uint64 - rv.freq, normBits, err = i.readFreqNorm() +func (i *PostingsIterator) currChunkNext(nChunk uint32) error { + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + _, _, hasLocs, err := i.readFreqNormHasLocs() if err != nil { - return nil, err + return err } - rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap.Contains(n) { - // read off 'freq' locations - rv.locs = make([]segment.Location, rv.freq) - locs := make([]Location, rv.freq) - for j := 0; j < int(rv.freq); j++ { - err := i.readLocation(&locs[j]) - if err != nil { - return nil, err - } - rv.locs[j] = &locs[j] + + if i.includeLocs && hasLocs { + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) + if err != nil { + return err } } - return rv, nil + return nil +} + +// DocNum1Hit returns the docNum and true if this is "1-hit" optimized +// and the docNum is available. +func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { + if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { + return p.docNum1Hit, true + } + return 0, false +} + +// PostingsIteratorFromBitmap constructs a PostingsIterator given an +// "actual" bitmap. 
+func PostingsIteratorFromBitmap(bm *roaring.Bitmap, + includeFreqNorm, includeLocs bool) (*PostingsIterator, error) { + return &PostingsIterator{ + ActualBM: bm, + Actual: bm.Iterator(), + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// PostingsIteratorFrom1Hit constructs a PostingsIterator given a +// 1-hit docNum. +func PostingsIteratorFrom1Hit(docNum1Hit, normBits1Hit uint64, + includeFreqNorm, includeLocs bool) (*PostingsIterator, error) { + return &PostingsIterator{ + docNum1Hit: docNum1Hit, + normBits1Hit: normBits1Hit, + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil } // Posting is a single entry in a postings list type Posting struct { - iterator *PostingsIterator - docNum uint64 + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location +} - freq uint64 - norm float32 - locs []segment.Location +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes } // Number returns the document number of this posting in this segment @@ -364,7 +841,7 @@ func (p *Posting) Number() uint64 { return p.docNum } -// Frequency returns the frequence of occurance of this term in this doc/field +// Frequency returns the frequencies of occurrence of this term in this doc/field func (p *Posting) Frequency() uint64 { return p.freq } @@ -374,12 +851,12 @@ func (p *Posting) Norm() float64 { return float64(p.norm) } -// Locations returns the location information for each occurance +// Locations returns the location information for each occurrence func (p *Posting) Locations() []segment.Location { return p.locs } -// Location represents the location of a single occurance +// Location represents the location of a single occurrence type Location struct { field string pos uint64 @@ -388,28 +865,34 @@ type Location struct { ap []uint64 } +func (l *Location) Size() int { + return reflectStaticSizeLocation + + 
len(l.field) + + len(l.ap)*size.SizeOfUint64 +} + // Field returns the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { return l.field } -// Start returns the start byte offset of this occurance +// Start returns the start byte offset of this occurrence func (l *Location) Start() uint64 { return l.start } -// End returns the end byte offset of this occurance +// End returns the end byte offset of this occurrence func (l *Location) End() uint64 { return l.end } -// Pos returns the 1-based phrase position of this occurance +// Pos returns the 1-based phrase position of this occurrence func (l *Location) Pos() uint64 { return l.pos } -// ArrayPositions returns the array position vector associated with this occurance +// ArrayPositions returns the array position vector associated with this occurrence func (l *Location) ArrayPositions() []uint64 { return l.ap } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go index 40c0af274..7ba28c236 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go @@ -20,16 +20,24 @@ import ( "fmt" "io" "os" + "reflect" "sync" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" "github.com/couchbase/vellum" mmap "github.com/edsrzf/mmap-go" "github.com/golang/snappy" ) +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size()) +} + // Open returns a zap impl of a segment func Open(path string) (segment.Segment, error) { f, err := os.Open(path) @@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) { SegmentBase: SegmentBase{ mem: mm[0 : 
len(mm)-FooterSize], fieldsMap: make(map[string]uint16), - fieldDvIterMap: make(map[uint16]*docValueIterator), + fieldDvReaders: make(map[uint16]*docValueReader), }, f: f, mm: mm, path: path, refs: 1, } + rv.SegmentBase.updateSize() err = rv.loadConfig() if err != nil { @@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) { return nil, err } - err = rv.loadDvIterators() + err = rv.loadDvReaders() if err != nil { _ = rv.Close() return nil, err @@ -89,7 +98,39 @@ type SegmentBase struct { fieldsIndexOffset uint64 docValueOffset uint64 dictLocs []uint64 - fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field + fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field + fieldDvNames []string // field names cached in fieldDvReaders + size uint64 +} + +func (sb *SegmentBase) Size() int { + return int(sb.size) +} + +func (sb *SegmentBase) updateSize() { + sizeInBytes := reflectStaticSizeSegmentBase + + cap(sb.mem) + + // fieldsMap + for k, _ := range sb.fieldsMap { + sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + size.SizeOfString + } + sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 + + // fieldDvReaders + for _, v := range sb.fieldDvReaders { + sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + + sb.size = uint64(sizeInBytes) } func (sb *SegmentBase) AddRef() {} @@ -111,56 +152,19 @@ type Segment struct { refs int64 } -func (s *Segment) SizeInBytes() uint64 { +func (s *Segment) Size() int { // 8 /* size of file pointer */ // 4 /* size of version -> uint32 */ // 4 /* size of crc -> uint32 */ sizeOfUints := 16 - sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints + sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints // mutex, refs -> int64 sizeInBytes += 16 // do not include the mmap'ed part - return uint64(sizeInBytes) + 
s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) -} - -func (s *SegmentBase) SizeInBytes() uint64 { - // 4 /* size of memCRC -> uint32 */ - // 4 /* size of chunkFactor -> uint32 */ - // 8 /* size of numDocs -> uint64 */ - // 8 /* size of storedIndexOffset -> uint64 */ - // 8 /* size of fieldsIndexOffset -> uint64 */ - // 8 /* size of docValueOffset -> uint64 */ - sizeInBytes := 40 - - sizeInBytes += len(s.mem) + int(segment.SizeOfSlice) - - // fieldsMap - for k, _ := range s.fieldsMap { - sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */ - } - sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ - - // fieldsInv, dictLocs - for _, entry := range s.fieldsInv { - sizeInBytes += (len(entry) + int(segment.SizeOfString)) - } - sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */ - sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */ - - // fieldDvIterMap - sizeInBytes += len(s.fieldDvIterMap) * - int(segment.SizeOfPointer+2 /* size of uint16 */) - for _, entry := range s.fieldDvIterMap { - if entry != nil { - sizeInBytes += int(entry.sizeInBytes()) - } - } - sizeInBytes += int(segment.SizeOfMap) - - return uint64(sizeInBytes) + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) } func (s *Segment) AddRef() { @@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error { verOffset := crcOffset - 4 s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) - if s.version != version { + if s.version != Version { return fmt.Errorf("unsupported version %d", s.version) } @@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error { } func (s *SegmentBase) loadFields() error { - // NOTE for now we assume the fields index immediately preceeds + // NOTE for now we assume the fields index immediately precedes // the footer, and if this changes, need to adjust accordingly (or // store explicit length), where s.mem was sliced from s.mm in Open(). 
fieldsIndexEnd := uint64(len(s.mem)) @@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { if err != nil { return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) } + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } } } } @@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { return rv, nil } +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + return reuse + }, +} + // VisitDocument invokes the DocFieldValueVistor for each stored field // for the specified doc number func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitDocument(vdc, num, visitor) +} + +func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, + visitor segment.DocumentFieldValueVisitor) error { // first make sure this is a valid number in this segment if num < s.numDocs { meta, compressed := s.getDocStoredMetaAndCompressed(num) - uncompressed, err := snappy.Decode(nil, compressed) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + idFieldVal := compressed[:idFieldValLen] + + keepGoing := visitor("_id", byte('t'), idFieldVal, nil) + if !keepGoing { + visitDocumentCtxPool.Put(vdc) + return nil + } + + // handle non-"_id" fields + compressed = compressed[idFieldValLen:] + + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) if err != nil { return err } 
- // now decode meta and process - reader := bytes.NewReader(meta) - decoder := govarint.NewU64Base128Decoder(reader) - keepGoing := true for keepGoing { - field, err := decoder.GetU64() + field, err := binary.ReadUvarint(&vdc.reader) if err == io.EOF { break } if err != nil { return err } - typ, err := decoder.GetU64() + typ, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } - offset, err := decoder.GetU64() + offset, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } - l, err := decoder.GetU64() + l, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } - numap, err := decoder.GetU64() + numap, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } var arrayPos []uint64 if numap > 0 { - arrayPos = make([]uint64, numap) + if cap(vdc.arrayPos) < int(numap) { + vdc.arrayPos = make([]uint64, numap) + } + arrayPos = vdc.arrayPos[:numap] for i := 0; i < int(numap); i++ { - ap, err := decoder.GetU64() + ap, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } @@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal value := uncompressed[offset : offset+l] keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) } + + vdc.buf = uncompressed } return nil } +// DocID returns the value of the _id field for the given docNum +func (s *SegmentBase) DocID(num uint64) ([]byte, error) { + if num >= s.numDocs { + return nil, nil + } + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return nil, err + } + idFieldVal := compressed[:idFieldValLen] + + visitDocumentCtxPool.Put(vdc) + + return idFieldVal, nil +} + // Count returns the number of documents in this segment. 
func (s *SegmentBase) Count() uint64 { return s.numDocs @@ -343,15 +417,26 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { return nil, err } - var postings *PostingsList + postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + filteredIds := make([]string, 0, len(ids)) for _, id := range ids { - postings, err = idDict.postingsList([]byte(id), nil, postings) + if id <= sMaxStr { + filteredIds = append(filteredIds, id) + } + } + + for _, id := range filteredIds { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) if err != nil { return nil, err } - if postings.postings != nil { - rv.Or(postings.postings) - } + postingsList.OrInto(rv) } } @@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) { return s.dictLocs[fieldIDPlus1-1], nil } -func (s *SegmentBase) loadDvIterators() error { +func (s *SegmentBase) loadDvReaders() error { if s.docValueOffset == fieldNotUninverted { return nil } var read uint64 for fieldID, field := range s.fieldsInv { - fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + var fieldLocStart, fieldLocEnd uint64 + var n int + fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) if n <= 0 { - return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) } - s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc) read += uint64(n) + fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + + fieldDvReader, _ := 
s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + if fieldDvReader != nil { + s.fieldDvReaders[uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, field) + } } + return nil } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go index c5316a99f..cddaedd00 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go @@ -15,7 +15,6 @@ package zap import ( - "bytes" "encoding/binary" "io" @@ -25,28 +24,29 @@ import ( // writes out the length of the roaring bitmap in bytes as varint // then writes out the roaring bitmap itself func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, - reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) { - reuseBuf.Reset() - - // write out postings list to memory so we know the len - postingsListLen, err := r.WriteTo(reuseBuf) + reuseBufVarint []byte) (int, error) { + buf, err := r.ToBytes() if err != nil { return 0, err } + var tw int - // write out the length of this postings list - n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen)) + + // write out the length + n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) nw, err := w.Write(reuseBufVarint[:n]) tw += nw if err != nil { return tw, err } - // write out the postings list itself - nw, err = w.Write(reuseBuf.Bytes()) + + // write out the roaring bytes + nw, err = w.Write(buf) tw += nw if err != nil { return tw, err } + return tw, nil } @@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset return err } // write out 32-bit version - err = binary.Write(w, binary.BigEndian, version) + err = binary.Write(w, binary.BigEndian, Version) if err != nil { return err } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go 
b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go index bb9975768..8babb31fa 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go @@ -15,10 +15,10 @@ package scorch import ( - "bytes" "container/heap" "encoding/binary" "fmt" + "reflect" "sort" "sync" "sync/atomic" @@ -27,8 +27,13 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" + lev2 "github.com/couchbase/vellum/levenshtein2" ) +// re usable, threadsafe levenshtein builders +var lb1, lb2 *lev2.LevenshteinAutomatonBuilder + type asynchSegmentResult struct { dictItr segment.DictionaryIterator @@ -40,15 +45,36 @@ type asynchSegmentResult struct { err error } +var reflectStaticSizeIndexSnapshot int + +func init() { + var is interface{} = IndexSnapshot{} + reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) + var err error + lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true) + if err != nil { + panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) + } + lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true) + if err != nil { + panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) + } +} + type IndexSnapshot struct { parent *Scorch segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte epoch uint64 + size uint64 + creator string m sync.Mutex // Protects the fields that follow. refs int64 + + m2 sync.Mutex // Protects the fields that follow. 
+ fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's } func (i *IndexSnapshot) Segments() []*SegmentSnapshot { @@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) { return err } +func (i *IndexSnapshot) Close() error { + return i.DecRef() +} + +func (i *IndexSnapshot) Size() int { + return int(i.size) +} + +func (i *IndexSnapshot) updateSize() { + i.size += uint64(reflectStaticSizeIndexSnapshot) + for _, s := range i.segment { + i.size += uint64(s.Size()) + } +} + func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - dict, err := segment.Dictionary(field) + dict, err := segment.segment.Dictionary(field) if err != nil { results <- &asynchSegmentResult{err: err} } else { @@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s if next != nil { rv.cursors = append(rv.cursors, &segmentDictCursor{ itr: asr.dictItr, - curr: next, + curr: *next, }) } } @@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, }) } +func (i *IndexSnapshot) FieldDictRegexp(field string, + termRegex string) (index.FieldDict, error) { + // TODO: potential optimization where the literal prefix represents the, + // entire regexp, allowing us to use PrefixIterator(prefixTerm)? 
+ + a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex) + if err != nil { + return nil, err + } + + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.AutomatonIterator(a, prefixBeg, prefixEnd) + }) +} + +func (i *IndexSnapshot) getLevAutomaton(term string, + fuzziness uint8) (vellum.Automaton, error) { + if fuzziness == 1 { + return lb1.BuildDfa(term, fuzziness) + } else if fuzziness == 2 { + return lb2.BuildDfa(term, fuzziness) + } + return nil, fmt.Errorf("fuzziness exceeds the max limit") +} + +func (i *IndexSnapshot) FieldDictFuzzy(field string, + term string, fuzziness int, prefix string) (index.FieldDict, error) { + a, err := i.getLevAutomaton(term, uint8(fuzziness)) + if err != nil { + return nil, err + } + + var prefixBeg, prefixEnd []byte + if prefix != "" { + prefixBeg = []byte(prefix) + prefixEnd = segment.IncrementBytes(prefixBeg) + } + + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.AutomatonIterator(a, prefixBeg, prefixEnd) + }) +} + +func (i *IndexSnapshot) FieldDictOnly(field string, + onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.OnlyIterator(onlyTerms, includeCount) + }) +} + func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { results := make(chan *asynchSegmentResult) for index, segment := range i.segment { @@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) rv = document.NewDocument(id) - err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { + err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool { if name 
== "_id" { return true } + + // copy value, array positions to preserve them beyond the scope of this callback + value := append([]byte(nil), val...) + arrayPos := append([]uint64(nil), pos...) + switch typ { case 't': - rv.AddField(document.NewTextField(name, pos, value)) + rv.AddField(document.NewTextField(name, arrayPos, value)) case 'n': - rv.AddField(document.NewNumericFieldFromBytes(name, pos, value)) + rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value)) case 'd': - rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value)) + rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value)) case 'b': - rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value)) + rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value)) case 'g': - rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value)) + rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value)) } return true @@ -307,24 +403,15 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { } segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) - var found bool - var rv string - err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { - if field == "_id" { - found = true - rv = string(value) - return false - } - return true - }) + v, err := i.segment[segmentIndex].DocID(localDocNum) if err != nil { return "", err } - - if found { - return rv, nil + if v == nil { + return "", fmt.Errorf("document number %d not found", docNum) } - return "", fmt.Errorf("document number %d not found", docNum) + + return string(v), nil } func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { @@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) 
(index.TermFieldReader, error) { - - rv := &IndexSnapshotTermFieldReader{ - term: term, - field: field, - snapshot: i, - postings: make([]segment.PostingsList, len(i.segment)), - iterators: make([]segment.PostingsIterator, len(i.segment)), - includeFreq: includeFreq, - includeNorm: includeNorm, - includeTermVectors: includeTermVectors, + rv := i.allocTermFieldReaderDicts(field) + + rv.term = term + rv.field = field + rv.snapshot = i + if rv.postings == nil { + rv.postings = make([]segment.PostingsList, len(i.segment)) + } + if rv.iterators == nil { + rv.iterators = make([]segment.PostingsIterator, len(i.segment)) + } + rv.segmentOffset = 0 + rv.includeFreq = includeFreq + rv.includeNorm = includeNorm + rv.includeTermVectors = includeTermVectors + rv.currPosting = nil + rv.currID = rv.currID[:0] + + if rv.dicts == nil { + rv.dicts = make([]segment.TermDictionary, len(i.segment)) + for i, segment := range i.segment { + dict, err := segment.segment.Dictionary(field) + if err != nil { + return nil, err + } + rv.dicts[i] = dict + } } + for i, segment := range i.segment { - dict, err := segment.Dictionary(field) - if err != nil { - return nil, err - } - pl, err := dict.PostingsList(string(term), nil) + pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i]) if err != nil { return nil, err } rv.postings[i] = pl - rv.iterators[i] = pl.Iterator() + rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i]) } - atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1)) + atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1)) return rv, nil } +func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) { + i.m2.Lock() + if i.fieldTFRs != nil { + tfrs := i.fieldTFRs[field] + last := len(tfrs) - 1 + if last >= 0 { + tfr = tfrs[last] + tfrs[last] = nil + i.fieldTFRs[field] = tfrs[:last] + i.m2.Unlock() + return + } + } + i.m2.Unlock() + return 
&IndexSnapshotTermFieldReader{} +} + +func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { + i.parent.rootLock.RLock() + obsolete := i.parent.root != i + i.parent.rootLock.RUnlock() + if obsolete { + // if we're not the current root (mutations happened), don't bother recycling + return + } + + i.m2.Lock() + if i.fieldTFRs == nil { + i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{} + } + i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) + i.m2.Unlock() +} + func docNumberToBytes(buf []byte, in uint64) []byte { if len(buf) != 8 { if cap(buf) >= 8 { @@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte { } func docInternalToNumber(in index.IndexInternalID) (uint64, error) { - var res uint64 - err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) - if err != nil { - return 0, err + if len(in) != 8 { + return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) } - return res, nil + return binary.BigEndian.Uint64(in), nil } func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, visitor index.DocumentFieldTermVisitor) error { + _, err := i.documentVisitFieldTerms(id, fields, visitor, nil) + return err +} +func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, + fields []string, visitor index.DocumentFieldTermVisitor, + dvs segment.DocVisitState) (segment.DocVisitState, error) { docNum, err := docInternalToNumber(id) if err != nil { - return err + return nil, err } + segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) if segmentIndex >= len(i.segment) { - return nil + return nil, nil } + _, dvs, err = i.documentVisitFieldTermsOnSegment( + segmentIndex, localDocNum, fields, nil, visitor, dvs) + + return dvs, err +} + +func (i *IndexSnapshot) documentVisitFieldTermsOnSegment( + segmentIndex int, localDocNum uint64, fields []string, cFields []string, + visitor index.DocumentFieldTermVisitor, dvs 
segment.DocVisitState) ( + cFieldsOut []string, dvsOut segment.DocVisitState, err error) { ss := i.segment[segmentIndex] - if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { - // get the list of doc value persisted fields - pFields, err := zaps.VisitableDocValueFields() + var vFields []string // fields that are visitable via the segment + + ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable) + if ssvOk && ssv != nil { + vFields, err = ssv.VisitableDocValueFields() if err != nil { - return err - } - // assort the fields for which terms look up have to - // be performed runtime - dvPendingFields := extractDvPendingFields(fields, pFields) - if len(dvPendingFields) == 0 { - // all fields are doc value persisted - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + return nil, nil, err } + } - // concurrently trigger the runtime doc value preparations for - // pending fields as well as the visit of the persisted doc values - errCh := make(chan error, 1) + var errCh chan error - go func() { - defer close(errCh) - err := ss.cachedDocs.prepareFields(fields, ss) - if err != nil { - errCh <- err - } - }() + // cFields represents the fields that we'll need from the + // cachedDocs, and might be optionally be provided by the caller, + // if the caller happens to know we're on the same segmentIndex + // from a previous invocation + if cFields == nil { + cFields = subtractStrings(fields, vFields) + + if !ss.cachedDocs.hasFields(cFields) { + errCh = make(chan error, 1) + + go func() { + err := ss.cachedDocs.prepareFields(cFields, ss) + if err != nil { + errCh <- err + } + close(errCh) + }() + } + } - // visit the persisted dv while the cache preparation is in progress - err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + if ssvOk && ssv != nil && len(vFields) > 0 { + dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) if err != nil { - return err + return nil, nil, err } + } - // err out if 
fieldCache preparation failed + if errCh != nil { err = <-errCh if err != nil { - return err + return nil, nil, err } + } - visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) - return nil + if len(cFields) > 0 { + ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) } - return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) + return cFields, dvs, nil +} + +func (i *IndexSnapshot) DocValueReader(fields []string) ( + index.DocValueReader, error) { + return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil +} + +type DocValueReader struct { + i *IndexSnapshot + fields []string + dvs segment.DocVisitState + + currSegmentIndex int + currCachedFields []string } -func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, - ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error { - err := ss.cachedDocs.prepareFields(fields, ss) +func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, + visitor index.DocumentFieldTermVisitor) (err error) { + docNum, err := docInternalToNumber(id) if err != nil { return err } - visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor) - return nil + segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum) + if segmentIndex >= len(dvr.i.segment) { + return nil + } + + if dvr.currSegmentIndex != segmentIndex { + dvr.currSegmentIndex = segmentIndex + dvr.currCachedFields = nil + } + + dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment( + dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs) + + return err } -func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, - ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) { +func (i *IndexSnapshot) DumpAll() chan interface{} { + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv +} - for _, field := range fields { - if cachedFieldDocs, exists := 
ss.cachedDocs.cache[field]; exists { - if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { - for { - i := bytes.Index(tlist, TermSeparatorSplitSlice) - if i < 0 { - break - } - visitor(field, tlist[0:i]) - tlist = tlist[i+1:] - } - } - } - } +func (i *IndexSnapshot) DumpDoc(id string) chan interface{} { + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv +} +func (i *IndexSnapshot) DumpFields() chan interface{} { + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv } -func extractDvPendingFields(requestedFields, persistedFields []string) []string { - removeMap := map[string]struct{}{} - for _, str := range persistedFields { - removeMap[str] = struct{}{} +// subtractStrings returns set a minus elements of set b. +func subtractStrings(a, b []string) []string { + if len(b) == 0 { + return a } - rv := make([]string, 0, len(requestedFields)) - for _, s := range requestedFields { - if _, ok := removeMap[s]; !ok { - rv = append(rv, s) + rv := make([]string, 0, len(a)) +OUTER: + for _, as := range a { + for _, bs := range b { + if as == bs { + continue OUTER + } } + rv = append(rv, as) } return rv } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go index 3c902cad6..abd3bde8c 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go @@ -23,12 +23,13 @@ import ( type segmentDictCursor struct { itr segment.DictionaryIterator - curr *index.DictEntry + curr index.DictEntry } type IndexSnapshotFieldDict struct { snapshot *IndexSnapshot cursors []*segmentDictCursor + entry index.DictEntry } func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) } @@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} { } func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { - if 
len(i.cursors) <= 0 { + if len(i.cursors) == 0 { return nil, nil } - rv := i.cursors[0].curr + i.entry = i.cursors[0].curr next, err := i.cursors[0].itr.Next() if err != nil { return nil, err @@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { heap.Pop(i) } else { // modified heap, fix it - i.cursors[0].curr = next + i.cursors[0].curr = *next heap.Fix(i, 0) } // look for any other entries with the exact same term - for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term { - rv.Count += i.cursors[0].curr.Count + for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term { + i.entry.Count += i.cursors[0].curr.Count next, err := i.cursors[0].itr.Next() if err != nil { return nil, err @@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { heap.Pop(i) } else { // modified heap, fix it - i.cursors[0].curr = next + i.cursors[0].curr = *next heap.Fix(i, 0) } } - return rv, nil + return &i.entry, nil } func (i *IndexSnapshotFieldDict) Close() error { diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go index d1205ff8e..27da20865 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go @@ -16,17 +16,30 @@ package scorch import ( "bytes" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexSnapshotDocIDReader int + +func init() { + var isdr IndexSnapshotDocIDReader + reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size()) +} + type IndexSnapshotDocIDReader struct { snapshot *IndexSnapshot iterators []roaring.IntIterable segmentOffset int } +func (i *IndexSnapshotDocIDReader) Size() int { + return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr +} + func (i 
*IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { for i.segmentOffset < len(i.iterators) { if !i.iterators[i.segmentOffset].HasNext() { diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go index 87fd0d14f..5d56f1944 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go @@ -16,16 +16,27 @@ package scorch import ( "bytes" + "fmt" + "reflect" "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexSnapshotTermFieldReader int + +func init() { + var istfr IndexSnapshotTermFieldReader + reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size()) +} + type IndexSnapshotTermFieldReader struct { term []byte field string snapshot *IndexSnapshot + dicts []segment.TermDictionary postings []segment.PostingsList iterators []segment.PostingsIterator segmentOffset int @@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct { currID index.IndexInternalID } +func (i *IndexSnapshotTermFieldReader) Size() int { + sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr + + len(i.term) + + len(i.field) + + len(i.currID) + + for _, entry := range i.postings { + sizeInBytes += entry.Size() + } + + for _, entry := range i.iterators { + sizeInBytes += entry.Size() + } + + if i.currPosting != nil { + sizeInBytes += i.currPosting.Size() + } + + return sizeInBytes +} + func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { rv := preAlloced if rv == nil { rv = &index.TermFieldDoc{} } // find the next hit - for i.segmentOffset < len(i.postings) { + for i.segmentOffset < len(i.iterators) { next, err := i.iterators[i.segmentOffset].Next() if err != nil { return 
nil, err @@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin } if i.includeTermVectors { locs := next.Locations() - rv.Vectors = make([]*index.TermFieldVector, len(locs)) + if cap(rv.Vectors) < len(locs) { + rv.Vectors = make([]*index.TermFieldVector, len(locs)) + backing := make([]index.TermFieldVector, len(locs)) + for i := range backing { + rv.Vectors[i] = &backing[i] + } + } + rv.Vectors = rv.Vectors[:len(locs)] for i, loc := range locs { - rv.Vectors[i] = &index.TermFieldVector{ + *rv.Vectors[i] = index.TermFieldVector{ Start: loc.Start(), End: loc.End(), Pos: loc.Pos(), @@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo } *i = *(i2.(*IndexSnapshotTermFieldReader)) } - // FIXME do something better - next, err := i.Next(preAlloced) + num, err := docInternalToNumber(ID) + if err != nil { + return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) + } + segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) + if segIndex >= len(i.snapshot.segment) { + return nil, fmt.Errorf("computed segment index %d out of bounds %d", + segIndex, len(i.snapshot.segment)) + } + // skip directly to the target segment + i.segmentOffset = segIndex + next, err := i.iterators[i.segmentOffset].Advance(ldocNum) if err != nil { return nil, err } if next == nil { - return nil, nil + // we jumped directly to the segment that should have contained it + // but it wasn't there, so reuse Next() which should correctly + // get the next hit after it (we moved i.segmentOffset) + return i.Next(preAlloced) } - for bytes.Compare(next.ID, ID) < 0 { - next, err = i.Next(preAlloced) - if err != nil { - return nil, err - } - if next == nil { - break - } + + if preAlloced == nil { + preAlloced = &index.TermFieldDoc{} } - return next, nil + preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + i.snapshot.offsets[segIndex]) + i.postingToTermFieldDoc(next, 
preAlloced) + i.currID = preAlloced.ID + i.currPosting = next + return preAlloced, nil } func (i *IndexSnapshotTermFieldReader) Count() uint64 { @@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { func (i *IndexSnapshotTermFieldReader) Close() error { if i.snapshot != nil { - atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1)) + atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) + i.snapshot.recycleTermFieldReader(i) } return nil } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go index 247003311..470868d0e 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go @@ -19,7 +19,7 @@ import ( "log" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) type RollbackPoint struct { diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go index 5e64cb1f2..f3a2c56a9 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go @@ -15,42 +15,25 @@ package scorch import ( + "bytes" "sync" + "sync/atomic" "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) var TermSeparator byte = 0xff var TermSeparatorSplitSlice = []byte{TermSeparator} -type SegmentDictionarySnapshot struct { - s *SegmentSnapshot - d segment.TermDictionary -} - -func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - // TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? 
- return s.d.PostingsList(term, s.s.deleted) -} - -func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator { - return s.d.Iterator() -} - -func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator { - return s.d.PrefixIterator(prefix) -} - -func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator { - return s.d.RangeIterator(start, end) -} - type SegmentSnapshot struct { id uint64 segment segment.Segment deleted *roaring.Bitmap + creator string cachedDocs *cachedDocs } @@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel return s.segment.VisitDocument(num, visitor) } -func (s *SegmentSnapshot) Count() uint64 { +func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) { + return s.segment.DocID(num) +} +func (s *SegmentSnapshot) Count() uint64 { rv := s.segment.Count() if s.deleted != nil { rv -= s.deleted.GetCardinality() @@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 { return rv } -func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) { - d, err := s.segment.Dictionary(field) - if err != nil { - return nil, err - } - return &SegmentDictionarySnapshot{ - s: s, - d: d, - }, nil -} - func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { rv, err := s.segment.DocNumbers(docIDs) if err != nil { @@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { return rv, nil } -// DocNumbersLive returns bitsit containing doc numbers for all live docs +// DocNumbersLive returns a bitmap containing doc numbers for all live docs func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { rv := roaring.NewBitmap() rv.AddRange(0, s.segment.Count()) @@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string { return s.segment.Fields() } +func (s *SegmentSnapshot) Size() (rv int) { + rv = s.segment.Size() + if s.deleted != nil { 
+ rv += int(s.deleted.GetSizeInBytes()) + } + rv += s.cachedDocs.Size() + return +} + type cachedFieldDocs struct { + m sync.Mutex readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. err error // Non-nil if there was an error when preparing this cachedFieldDocs. docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. + size uint64 } -func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { - defer close(cfd.readyCh) +func (cfd *cachedFieldDocs) Size() int { + var rv int + cfd.m.Lock() + for _, entry := range cfd.docs { + rv += 8 /* size of uint64 */ + len(entry) + } + cfd.m.Unlock() + return rv +} + +func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { + cfd.m.Lock() + defer func() { + close(cfd.readyCh) + cfd.m.Unlock() + }() + cfd.size += uint64(size.SizeOfUint64) /* size field */ dict, err := ss.segment.Dictionary(field) if err != nil { cfd.err = err return } + var postings segment.PostingsList + var postingsItr segment.PostingsIterator + dictItr := dict.Iterator() next, err := dictItr.Next() for err == nil && next != nil { - postings, err1 := dict.PostingsList(next.Term, nil) + var err1 error + postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings) if err1 != nil { cfd.err = err1 return } - postingsItr := postings.Iterator() + cfd.size += uint64(size.SizeOfUint64) /* map key */ + postingsItr = postings.Iterator(false, false, false, postingsItr) nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) 
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) + cfd.size += uint64(len(next.Term) + 1) // map value nextPosting, err2 = postingsItr.Next() } @@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { type cachedDocs struct { m sync.Mutex // As the cache is asynchronously prepared, need a lock cache map[string]*cachedFieldDocs // Keyed by field + size uint64 } func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { c.m.Lock() + if c.cache == nil { c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) } @@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e docs: make(map[uint64][]byte), } - go c.cache[field].prepareFields(field, ss) + go c.cache[field].prepareField(field, ss) } } @@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e c.m.Lock() } + c.updateSizeLOCKED() + c.m.Unlock() return nil } -func (c *cachedDocs) sizeInBytes() uint64 { - sizeInBytes := 0 +// hasFields returns true if the cache has all the given fields +func (c *cachedDocs) hasFields(fields []string) bool { c.m.Lock() + for _, field := range fields { + if _, exists := c.cache[field]; !exists { + c.m.Unlock() + return false // found a field not in cache + } + } + c.m.Unlock() + return true +} + +func (c *cachedDocs) Size() int { + return int(atomic.LoadUint64(&c.size)) +} + +func (c *cachedDocs) updateSizeLOCKED() { + sizeInBytes := 0 for k, v := range c.cache { // cachedFieldDocs sizeInBytes += len(k) if v != nil { - for _, entry := range v.docs { // docs - sizeInBytes += 8 /* size of uint64 */ + len(entry) + sizeInBytes += v.Size() + } + } + atomic.StoreUint64(&c.size, uint64(sizeInBytes)) +} + +func (c *cachedDocs) visitDoc(localDocNum uint64, + fields []string, visitor index.DocumentFieldTermVisitor) { + c.m.Lock() + + for _, field := range fields { + if cachedFieldDocs, exists := c.cache[field]; exists { + 
c.m.Unlock() + <-cachedFieldDocs.readyCh + c.m.Lock() + + if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { + for { + i := bytes.Index(tlist, TermSeparatorSplitSlice) + if i < 0 { + break + } + visitor(field, tlist[0:i]) + tlist = tlist[i+1:] + } } } } + c.m.Unlock() - return uint64(sizeInBytes) } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/stats.go b/vendor/github.com/blevesearch/bleve/index/scorch/stats.go index c44a977bf..2eb832f2c 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/stats.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/stats.go @@ -16,63 +16,125 @@ package scorch import ( "encoding/json" - "io/ioutil" + "reflect" "sync/atomic" ) -// Stats tracks statistics about the index +// Stats tracks statistics about the index, fields that are +// prefixed like CurXxxx are gauges (can go up and down), +// and fields that are prefixed like TotXxxx are monotonically +// increasing counters. type Stats struct { - updates, deletes, batches, errors uint64 - analysisTime, indexTime uint64 - termSearchersStarted uint64 - termSearchersFinished uint64 - numPlainTextBytesIndexed uint64 - numItemsIntroduced uint64 - numItemsPersisted uint64 - i *Scorch -} + TotUpdates uint64 + TotDeletes uint64 -func (s *Stats) statsMap() (map[string]interface{}, error) { - m := map[string]interface{}{} - m["updates"] = atomic.LoadUint64(&s.updates) - m["deletes"] = atomic.LoadUint64(&s.deletes) - m["batches"] = atomic.LoadUint64(&s.batches) - m["errors"] = atomic.LoadUint64(&s.errors) - m["analysis_time"] = atomic.LoadUint64(&s.analysisTime) - m["index_time"] = atomic.LoadUint64(&s.indexTime) - m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted) - m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished) - m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed) - m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced) - m["num_items_persisted"] = 
atomic.LoadUint64(&s.numItemsPersisted) - - if s.i.path != "" { - finfos, err := ioutil.ReadDir(s.i.path) - if err != nil { - return nil, err - } + TotBatches uint64 + TotBatchesEmpty uint64 + TotBatchIntroTime uint64 + MaxBatchIntroTime uint64 - var numFilesOnDisk, numBytesUsedDisk uint64 + CurRootEpoch uint64 + LastPersistedEpoch uint64 + LastMergedEpoch uint64 - for _, finfo := range finfos { - if !finfo.IsDir() { - numBytesUsedDisk += uint64(finfo.Size()) - numFilesOnDisk++ - } - } + TotOnErrors uint64 - m["num_bytes_used_disk"] = numBytesUsedDisk - m["num_files_on_disk"] = numFilesOnDisk - } + TotAnalysisTime uint64 + TotIndexTime uint64 + + TotIndexedPlainTextBytes uint64 + + TotTermSearchersStarted uint64 + TotTermSearchersFinished uint64 + + TotIntroduceLoop uint64 + TotIntroduceSegmentBeg uint64 + TotIntroduceSegmentEnd uint64 + TotIntroducePersistBeg uint64 + TotIntroducePersistEnd uint64 + TotIntroduceMergeBeg uint64 + TotIntroduceMergeEnd uint64 + TotIntroduceRevertBeg uint64 + TotIntroduceRevertEnd uint64 + + TotIntroducedItems uint64 + TotIntroducedSegmentsBatch uint64 + TotIntroducedSegmentsMerge uint64 + + TotPersistLoopBeg uint64 + TotPersistLoopErr uint64 + TotPersistLoopProgress uint64 + TotPersistLoopWait uint64 + TotPersistLoopWaitNotified uint64 + TotPersistLoopEnd uint64 + + TotPersistedItems uint64 + TotItemsToPersist uint64 + TotPersistedSegments uint64 + + TotPersisterSlowMergerPause uint64 + TotPersisterSlowMergerResume uint64 + + TotPersisterNapPauseCompleted uint64 + TotPersisterMergerNapBreak uint64 - return m, nil + TotFileMergeLoopBeg uint64 + TotFileMergeLoopErr uint64 + TotFileMergeLoopEnd uint64 + + TotFileMergePlan uint64 + TotFileMergePlanErr uint64 + TotFileMergePlanNone uint64 + TotFileMergePlanOk uint64 + + TotFileMergePlanTasks uint64 + TotFileMergePlanTasksDone uint64 + TotFileMergePlanTasksErr uint64 + TotFileMergePlanTasksSegments uint64 + TotFileMergePlanTasksSegmentsEmpty uint64 + + TotFileMergeSegmentsEmpty uint64 + 
TotFileMergeSegments uint64 + TotFileSegmentsAtRoot uint64 + TotFileMergeWrittenBytes uint64 + + TotFileMergeZapBeg uint64 + TotFileMergeZapEnd uint64 + TotFileMergeZapTime uint64 + MaxFileMergeZapTime uint64 + + TotFileMergeIntroductions uint64 + TotFileMergeIntroductionsDone uint64 + TotFileMergeIntroductionsSkipped uint64 + + TotMemMergeBeg uint64 + TotMemMergeErr uint64 + TotMemMergeDone uint64 + TotMemMergeZapBeg uint64 + TotMemMergeZapEnd uint64 + TotMemMergeZapTime uint64 + MaxMemMergeZapTime uint64 + TotMemMergeSegments uint64 + TotMemorySegmentsAtRoot uint64 } -// MarshalJSON implements json.Marshaler -func (s *Stats) MarshalJSON() ([]byte, error) { - m, err := s.statsMap() - if err != nil { - return nil, err +// atomically populates the returned map +func (s *Stats) ToMap() map[string]interface{} { + m := map[string]interface{}{} + sve := reflect.ValueOf(s).Elem() + svet := sve.Type() + for i := 0; i < svet.NumField(); i++ { + svef := sve.Field(i) + if svef.CanAddr() { + svefp := svef.Addr().Interface() + m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64)) + } } - return json.Marshal(m) + return m +} + +// MarshalJSON implements json.Marshaler, and in contrast to standard +// json marshaling provides atomic safety +func (s *Stats) MarshalJSON() ([]byte, error) { + return json.Marshal(s.ToMap()) } diff --git a/vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go b/vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go index 82ab946fd..4b5019f1f 100644 --- a/vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go +++ b/vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go @@ -17,7 +17,7 @@ package boltdb import ( "bytes" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) type Iterator struct { diff --git a/vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go b/vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go index 1d701c982..4cd94183c 100644 --- 
a/vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go +++ b/vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go @@ -16,7 +16,7 @@ package boltdb import ( "github.com/blevesearch/bleve/index/store" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) type Reader struct { diff --git a/vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go b/vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go index d8de0768f..56613d531 100644 --- a/vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go +++ b/vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go @@ -30,7 +30,7 @@ import ( "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) const ( @@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore, bo.ReadOnly = ro } + if initialMmapSize, ok := config["initialMmapSize"].(int); ok { + bo.InitialMmapSize = initialMmapSize + } else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok { + bo.InitialMmapSize = int(initialMmapSize) + } + db, err := bolt.Open(path, 0600, bo) if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go b/vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go index 77d523c30..ea7243eaa 100644 --- a/vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go +++ b/vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go @@ -15,11 +15,20 @@ package upsidedown import ( + "reflect" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" ) +var reflectStaticSizeIndexReader int + +func init() { + var ir IndexReader + reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size()) +} + type IndexReader struct { index *UpsideDownCouch kvreader store.KVReader @@ -201,3 +210,17 @@ func 
incrementBytes(in []byte) []byte { } return rv } + +func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) { + return &DocValueReader{i: i, fields: fields}, nil +} + +type DocValueReader struct { + i *IndexReader + fields []string +} + +func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, + visitor index.DocumentFieldTermVisitor) error { + return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor) +} diff --git a/vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go b/vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go index 1f40c02de..bc0fef119 100644 --- a/vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go +++ b/vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go @@ -16,13 +16,27 @@ package upsidedown import ( "bytes" + "reflect" "sort" "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeUpsideDownCouchTermFieldReader int +var reflectStaticSizeUpsideDownCouchDocIDReader int + +func init() { + var tfr UpsideDownCouchTermFieldReader + reflectStaticSizeUpsideDownCouchTermFieldReader = + int(reflect.TypeOf(tfr).Size()) + var cdr UpsideDownCouchDocIDReader + reflectStaticSizeUpsideDownCouchDocIDReader = + int(reflect.TypeOf(cdr).Size()) +} + type UpsideDownCouchTermFieldReader struct { count uint64 indexReader *IndexReader @@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct { includeTermVectors bool } +func (r *UpsideDownCouchTermFieldReader) Size() int { + sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr + + len(r.term) + + r.tfrPrealloc.Size() + + len(r.keyBuf) + + if r.tfrNext != nil { + sizeInBytes += r.tfrNext.Size() + } + + return sizeInBytes +} + func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) { 
bufNeeded := termFrequencyRowKeySize(term, nil) if bufNeeded < dictionaryRowKeySize(term) { @@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct { onlyMode bool } -func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { +func (r *UpsideDownCouchDocIDReader) Size() int { + sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader + + reflectStaticSizeIndexReader + size.SizeOfPtr + + for _, entry := range r.only { + sizeInBytes += size.SizeOfString + len(entry) + } + return sizeInBytes +} + +func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { startBytes := []byte{0x0} endBytes := []byte{0xff} diff --git a/vendor/github.com/blevesearch/bleve/index/upsidedown/row.go b/vendor/github.com/blevesearch/bleve/index/upsidedown/row.go index 7e503ae05..531e0a0d3 100644 --- a/vendor/github.com/blevesearch/bleve/index/upsidedown/row.go +++ b/vendor/github.com/blevesearch/bleve/index/upsidedown/row.go @@ -20,10 +20,22 @@ import ( "fmt" "io" "math" + "reflect" + "github.com/blevesearch/bleve/size" "github.com/golang/protobuf/proto" ) +var reflectStaticSizeTermFrequencyRow int +var reflectStaticSizeTermVector int + +func init() { + var tfr TermFrequencyRow + reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size()) + var tv TermVector + reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size()) +} + const ByteSeparator byte = 0xff type UpsideDownCouchRowStream chan UpsideDownCouchRow @@ -358,6 +370,11 @@ type TermVector struct { end uint64 } +func (tv *TermVector) Size() int { + return reflectStaticSizeTermVector + size.SizeOfPtr + + len(tv.arrayPositions)*size.SizeOfUint64 +} + func (tv *TermVector) String() string { return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) } @@ -371,6 +388,18 @@ type TermFrequencyRow struct { field uint16 } +func (tfr *TermFrequencyRow) Size() int { + sizeInBytes 
:= reflectStaticSizeTermFrequencyRow + + len(tfr.term) + + len(tfr.doc) + + for _, entry := range tfr.vectors { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + func (tfr *TermFrequencyRow) Term() []byte { return tfr.term } @@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error { func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error { tfr.doc = key[3+len(term)+1:] - if len(tfr.doc) <= 0 { + if len(tfr.doc) == 0 { return fmt.Errorf("invalid term frequency key, empty docid") } diff --git a/vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go b/vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go index 70e6e457f..e4bc3d8f0 100644 --- a/vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go +++ b/vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go @@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis. } func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { - if len(in) <= 0 { + if len(in) == 0 { return nil } @@ -810,15 +810,17 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { } } - go func() { - for _, doc := range batch.IndexOps { - if doc != nil { - aw := index.NewAnalysisWork(udc, doc, resultChan) - // put the work on the queue - udc.analysisQueue.Queue(aw) + if len(batch.IndexOps) > 0 { + go func() { + for _, doc := range batch.IndexOps { + if doc != nil { + aw := index.NewAnalysisWork(udc, doc, resultChan) + // put the work on the queue + udc.analysisQueue.Queue(aw) + } } - } - }() + }() + } // retrieve back index rows concurrent with analysis docBackIndexRowErr := error(nil) @@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { } else { atomic.AddUint64(&udc.stats.errors, 1) } + + persistedCallback := batch.PersistedCallback() + if persistedCallback != nil { + persistedCallback(err) + } return } diff --git 
a/vendor/github.com/blevesearch/bleve/index_alias_impl.go b/vendor/github.com/blevesearch/bleve/index_alias_impl.go index f678a059b..335fcade2 100644 --- a/vendor/github.com/blevesearch/bleve/index_alias_impl.go +++ b/vendor/github.com/blevesearch/bleve/index_alias_impl.go @@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest { Explain: req.Explain, Sort: req.Sort.Copy(), IncludeLocations: req.IncludeLocations, + Score: req.Score, } return &rv } diff --git a/vendor/github.com/blevesearch/bleve/index_impl.go b/vendor/github.com/blevesearch/bleve/index_impl.go index caea1b8e0..fe61b8064 100644 --- a/vendor/github.com/blevesearch/bleve/index_impl.go +++ b/vendor/github.com/blevesearch/bleve/index_impl.go @@ -50,6 +50,12 @@ const storePath = "store" var mappingInternalKey = []byte("_mapping") +const SearchQueryStartCallbackKey = "_search_query_start_callback_key" +const SearchQueryEndCallbackKey = "_search_query_end_callback_key" + +type SearchQueryStartCallbackFn func(size uint64) error +type SearchQueryEndCallbackFn func(size uint64) error + func indexStorePath(path string) string { return path + string(os.PathSeparator) + storePath } @@ -362,8 +368,70 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { return i.SearchInContext(context.Background(), req) } +var documentMatchEmptySize int +var searchContextEmptySize int +var facetResultEmptySize int +var documentEmptySize int + +func init() { + var dm search.DocumentMatch + documentMatchEmptySize = dm.Size() + + var sc search.SearchContext + searchContextEmptySize = sc.Size() + + var fr search.FacetResult + facetResultEmptySize = fr.Size() + + var d document.Document + documentEmptySize = d.Size() +} + +// memNeededForSearch is a helper function that returns an estimate of RAM +// needed to execute a search request. 
+func memNeededForSearch(req *SearchRequest, + searcher search.Searcher, + topnCollector *collector.TopNCollector) uint64 { + + backingSize := req.Size + req.From + 1 + if req.Size+req.From > collector.PreAllocSizeSkipCap { + backingSize = collector.PreAllocSizeSkipCap + 1 + } + numDocMatches := backingSize + searcher.DocumentMatchPoolSize() + + estimate := 0 + + // overhead, size in bytes from collector + estimate += topnCollector.Size() + + // pre-allocing DocumentMatchPool + estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize + + // searcher overhead + estimate += searcher.Size() + + // overhead from results, lowestMatchOutsideResults + estimate += (numDocMatches + 1) * documentMatchEmptySize + + // additional overhead from SearchResult + estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus + + // overhead from facet results + if req.Facets != nil { + estimate += len(req.Facets) * facetResultEmptySize + } + + // highlighting, store + if len(req.Fields) > 0 || req.Highlight != nil { + // Size + From => number of hits + estimate += (req.Size + req.From) * documentEmptySize + } + + return uint64(estimate) +} + // SearchInContext executes a search request operation within the provided -// Context. Returns a SearchResult object or an error. +// Context. Returns a SearchResult object or an error. 
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { i.mutex.RLock() defer i.mutex.RUnlock() @@ -390,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{ Explain: req.Explain, IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, + Score: req.Score, }) if err != nil { return nil, err @@ -428,6 +497,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr collector.SetFacetsBuilder(facetsBuilder) } + memNeeded := memNeededForSearch(req, searcher, collector) + if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil { + if cbF, ok := cb.(SearchQueryStartCallbackFn); ok { + err = cbF(memNeeded) + } + } + if err != nil { + return nil, err + } + + if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil { + if cbF, ok := cb.(SearchQueryEndCallbackFn); ok { + defer func() { + _ = cbF(memNeeded) + }() + } + } + err = collector.Collect(ctx, searcher, indexReader) if err != nil { return nil, err @@ -459,7 +546,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr doc, err := indexReader.Document(hit.ID) if err == nil && doc != nil { if len(req.Fields) > 0 { - for _, f := range req.Fields { + fieldsToLoad := deDuplicate(req.Fields) + for _, f := range fieldsToLoad { for _, docF := range doc.Fields { if f == "*" || docF.Name() == f { var value interface{} @@ -533,9 +621,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return &SearchResult{ Status: &SearchStatus{ Total: 1, - Failed: 0, Successful: 1, - Errors: make(map[string]error), }, Request: req, Hits: hits, @@ -755,3 +841,16 @@ func (f *indexImplFieldDict) Close() error { } return f.indexReader.Close() } + +// helper function to remove duplicate entries from slice of strings +func deDuplicate(fields []string) []string { + entries := 
make(map[string]struct{}) + ret := []string{} + for _, entry := range fields { + if _, exists := entries[entry]; !exists { + entries[entry] = struct{}{} + ret = append(ret, entry) + } + } + return ret +} diff --git a/vendor/github.com/blevesearch/bleve/index_meta.go b/vendor/github.com/blevesearch/bleve/index_meta.go index 95592a65d..d814799a8 100644 --- a/vendor/github.com/blevesearch/bleve/index_meta.go +++ b/vendor/github.com/blevesearch/bleve/index_meta.go @@ -18,6 +18,7 @@ import ( "encoding/json" "io/ioutil" "os" + "path/filepath" "github.com/blevesearch/bleve/index/upsidedown" ) @@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) { } func indexMetaPath(path string) string { - return path + string(os.PathSeparator) + metaFilename + return filepath.Join(path, metaFilename) } diff --git a/vendor/github.com/blevesearch/bleve/mapping/document.go b/vendor/github.com/blevesearch/bleve/mapping/document.go index 6ec0c66bb..f950b59be 100644 --- a/vendor/github.com/blevesearch/bleve/mapping/document.go +++ b/vendor/github.com/blevesearch/bleve/mapping/document.go @@ -42,7 +42,7 @@ type DocumentMapping struct { Dynamic bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"` - DefaultAnalyzer string `json:"default_analyzer"` + DefaultAnalyzer string `json:"default_analyzer,omitempty"` // StructTagKey overrides "json" when looking for field names in struct tags StructTagKey string `json:"struct_tag_key,omitempty"` @@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { } func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { - // allow default "json" tag to be overriden + // allow default "json" tag to be overridden structTagKey := dm.StructTagKey if structTagKey == "" { structTagKey = "json" } val := reflect.ValueOf(data) + if !val.IsValid() { + return + } + typ := val.Type() 
switch typ.Kind() { case reflect.Map: @@ -420,7 +424,11 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, if subDocMapping != nil { // index by explicit mapping for _, fieldMapping := range subDocMapping.Fields { - fieldMapping.processString(propertyValueString, pathString, path, indexes, context) + if fieldMapping.Type == "geopoint" { + fieldMapping.processGeoPoint(property, pathString, path, indexes, context) + } else { + fieldMapping.processString(propertyValueString, pathString, path, indexes, context) + } } } else if closestDocMapping.Dynamic { // automatic indexing behavior diff --git a/vendor/github.com/blevesearch/bleve/mapping/index.go b/vendor/github.com/blevesearch/bleve/mapping/index.go index fc5d12a73..602764cbb 100644 --- a/vendor/github.com/blevesearch/bleve/mapping/index.go +++ b/vendor/github.com/blevesearch/bleve/mapping/index.go @@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string { func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error { docType := im.determineType(data) docMapping := im.mappingForType(docType) - walkContext := im.newWalkContext(doc, docMapping) if docMapping.Enabled { + walkContext := im.newWalkContext(doc, docMapping) docMapping.walkDocument(data, []string{}, []uint64{}, walkContext) // see if the _all field was disabled diff --git a/vendor/github.com/blevesearch/bleve/mapping/reflect.go b/vendor/github.com/blevesearch/bleve/mapping/reflect.go index 3068b1906..6500a7059 100644 --- a/vendor/github.com/blevesearch/bleve/mapping/reflect.go +++ b/vendor/github.com/blevesearch/bleve/mapping/reflect.go @@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} { func lookupPropertyPathPart(data interface{}, part string) interface{} { val := reflect.ValueOf(data) + if !val.IsValid() { + return nil + } typ := val.Type() switch typ.Kind() { case reflect.Map: diff --git 
a/vendor/github.com/blevesearch/bleve/numeric/bin.go b/vendor/github.com/blevesearch/bleve/numeric/bin.go index cd71392dc..368952a2c 100644 --- a/vendor/github.com/blevesearch/bleve/numeric/bin.go +++ b/vendor/github.com/blevesearch/bleve/numeric/bin.go @@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16} // Interleave the first 32 bits of each uint64 // apdated from org.apache.lucene.util.BitUtil -// whcih was adapted from: +// which was adapted from: // http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN func Interleave(v1, v2 uint64) uint64 { v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4] diff --git a/vendor/github.com/blevesearch/bleve/numeric/prefix_coded.go b/vendor/github.com/blevesearch/bleve/numeric/prefix_coded.go index 4200c23bb..76ea001ba 100644 --- a/vendor/github.com/blevesearch/bleve/numeric/prefix_coded.go +++ b/vendor/github.com/blevesearch/bleve/numeric/prefix_coded.go @@ -77,6 +77,10 @@ func (p PrefixCoded) Int64() (int64, error) { } func ValidPrefixCodedTerm(p string) (bool, int) { + return ValidPrefixCodedTermBytes([]byte(p)) +} + +func ValidPrefixCodedTermBytes(p []byte) (bool, int) { if len(p) > 0 { if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 { return false, 0 diff --git a/vendor/github.com/blevesearch/bleve/search.go b/vendor/github.com/blevesearch/bleve/search.go index 46d849c1b..ebd69971e 100644 --- a/vendor/github.com/blevesearch/bleve/search.go +++ b/vendor/github.com/blevesearch/bleve/search.go @@ -17,15 +17,29 @@ package bleve import ( "encoding/json" "fmt" + "reflect" "time" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/datetime/optional" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/collector" "github.com/blevesearch/bleve/search/query" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeSearchResult int +var 
reflectStaticSizeSearchStatus int + +func init() { + var sr SearchResult + reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size()) + var ss SearchStatus + reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size()) +} + var cache = registry.NewCache() const defaultDateTimeParser = optional.Name @@ -247,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) { // Explain triggers inclusion of additional search // result score explanations. // Sort describes the desired order for the results to be returned. +// Score controls the kind of scoring performed // // A special field named "*" can be used to return all fields. type SearchRequest struct { @@ -259,6 +274,7 @@ type SearchRequest struct { Explain bool `json:"explain"` Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` + Score string `json:"score,omitempty"` } func (r *SearchRequest) Validate() error { @@ -308,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { Explain bool `json:"explain"` Sort []json.RawMessage `json:"sort"` IncludeLocations bool `json:"includeLocations"` + Score string `json:"score"` } err := json.Unmarshal(input, &temp) @@ -334,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { r.Fields = temp.Fields r.Facets = temp.Facets r.IncludeLocations = temp.IncludeLocations + r.Score = temp.Score r.Query, err = query.ParseQuery(temp.Q) if err != nil { return err @@ -432,6 +450,24 @@ type SearchResult struct { Facets search.FacetResults `json:"facets"` } +func (sr *SearchResult) Size() int { + sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr + + reflectStaticSizeSearchStatus + + for _, entry := range sr.Hits { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + for k, v := range sr.Facets { + sizeInBytes += size.SizeOfString + len(k) + + v.Size() + } + + return sizeInBytes +} + func (sr *SearchResult) String() string { rv := "" if sr.Total > 0 { @@ -488,3 +524,44 @@ func (sr *SearchResult) 
Merge(other *SearchResult) { sr.Facets.Merge(other.Facets) } + +// MemoryNeededForSearchResult is an exported helper function to determine the RAM +// needed to accommodate the results for a given search request. +func MemoryNeededForSearchResult(req *SearchRequest) uint64 { + if req == nil { + return 0 + } + + numDocMatches := req.Size + req.From + if req.Size+req.From > collector.PreAllocSizeSkipCap { + numDocMatches = collector.PreAllocSizeSkipCap + } + + estimate := 0 + + // overhead from the SearchResult structure + var sr SearchResult + estimate += sr.Size() + + var dm search.DocumentMatch + sizeOfDocumentMatch := dm.Size() + + // overhead from results + estimate += numDocMatches * sizeOfDocumentMatch + + // overhead from facet results + if req.Facets != nil { + var fr search.FacetResult + estimate += len(req.Facets) * fr.Size() + } + + // highlighting, store + var d document.Document + if len(req.Fields) > 0 || req.Highlight != nil { + for i := 0; i < (req.Size + req.From); i++ { + estimate += (req.Size + req.From) * d.Size() + } + } + + return uint64(estimate) +} diff --git a/vendor/github.com/blevesearch/bleve/search/collector.go b/vendor/github.com/blevesearch/bleve/search/collector.go index 0d163a9d9..df3ff9c5a 100644 --- a/vendor/github.com/blevesearch/bleve/search/collector.go +++ b/vendor/github.com/blevesearch/bleve/search/collector.go @@ -30,3 +30,23 @@ type Collector interface { SetFacetsBuilder(facetsBuilder *FacetsBuilder) FacetResults() FacetResults } + +// DocumentMatchHandler is the type of document match callback +// bleve will invoke during the search. +// Eventually, bleve will indicate the completion of an ongoing search, +// by passing a nil value for the document match callback. +// The application should take a copy of the hit/documentMatch +// if it wish to own it or need prolonged access to it. 
+type DocumentMatchHandler func(hit *DocumentMatch) error + +type MakeDocumentMatchHandlerKeyType string + +var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( + "MakeDocumentMatchHandlerKey") + +// MakeDocumentMatchHandler is an optional DocumentMatchHandler +// builder function which the applications can pass to bleve. +// These builder methods gives a DocumentMatchHandler function +// to bleve, which it will invoke on every document matches. +type MakeDocumentMatchHandler func(ctx *SearchContext) ( + callback DocumentMatchHandler, loadID bool, err error) diff --git a/vendor/github.com/blevesearch/bleve/search/collector/heap.go b/vendor/github.com/blevesearch/bleve/search/collector/heap.go index bdf72eade..05502d5df 100644 --- a/vendor/github.com/blevesearch/bleve/search/collector/heap.go +++ b/vendor/github.com/blevesearch/bleve/search/collector/heap.go @@ -25,9 +25,9 @@ type collectStoreHeap struct { compare collectorCompare } -func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap { +func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap { rv := &collectStoreHeap{ - heap: make(search.DocumentMatchCollection, 0, cap), + heap: make(search.DocumentMatchCollection, 0, capacity), compare: compare, } heap.Init(rv) diff --git a/vendor/github.com/blevesearch/bleve/search/collector/list.go b/vendor/github.com/blevesearch/bleve/search/collector/list.go index ec2f69cb8..f01d205c9 100644 --- a/vendor/github.com/blevesearch/bleve/search/collector/list.go +++ b/vendor/github.com/blevesearch/bleve/search/collector/list.go @@ -25,7 +25,7 @@ type collectStoreList struct { compare collectorCompare } -func newStoreList(cap int, compare collectorCompare) *collectStoreList { +func newStoreList(capacity int, compare collectorCompare) *collectStoreList { rv := &collectStoreList{ results: list.New(), compare: compare, @@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList { return rv } -func (c 
*collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, - size int) *search.DocumentMatch { +func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { c.add(doc) if c.len() > size { return c.removeLast() diff --git a/vendor/github.com/blevesearch/bleve/search/collector/slice.go b/vendor/github.com/blevesearch/bleve/search/collector/slice.go index 32cb86244..85fe73c40 100644 --- a/vendor/github.com/blevesearch/bleve/search/collector/slice.go +++ b/vendor/github.com/blevesearch/bleve/search/collector/slice.go @@ -21,9 +21,9 @@ type collectStoreSlice struct { compare collectorCompare } -func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice { +func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice { rv := &collectStoreSlice{ - slice: make(search.DocumentMatchCollection, 0, cap), + slice: make(search.DocumentMatchCollection, 0, capacity), compare: compare, } return rv diff --git a/vendor/github.com/blevesearch/bleve/search/collector/topn.go b/vendor/github.com/blevesearch/bleve/search/collector/topn.go index 388370e7e..378a7b114 100644 --- a/vendor/github.com/blevesearch/bleve/search/collector/topn.go +++ b/vendor/github.com/blevesearch/bleve/search/collector/topn.go @@ -16,12 +16,21 @@ package collector import ( "context" + "reflect" "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTopNCollector int + +func init() { + var coll TopNCollector + reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size()) +} + type collectorStore interface { // Add the document, and if the new store size exceeds the provided size // the last element is removed and returned. 
If the size has not been @@ -58,6 +67,8 @@ type TopNCollector struct { cachedDesc []bool lowestMatchOutsideResults *search.DocumentMatch + updateFieldVisitor index.DocumentFieldTermVisitor + dvReader index.DocValueReader } // CheckDoneEvery controls how frequently we check the context deadline @@ -98,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector return hc } +func (hc *TopNCollector) Size() int { + sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr + + if hc.facetsBuilder != nil { + sizeInBytes += hc.facetsBuilder.Size() + } + + for _, entry := range hc.neededFields { + sizeInBytes += len(entry) + size.SizeOfString + } + + sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc) + + return sizeInBytes +} + // Collect goes to the index to find the matching documents func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { startTime := time.Now() @@ -113,8 +140,34 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } searchContext := &search.SearchContext{ DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), + Collector: hc, } + hc.dvReader, err = reader.DocValueReader(hc.neededFields) + if err != nil { + return err + } + + hc.updateFieldVisitor = func(field string, term []byte) { + if hc.facetsBuilder != nil { + hc.facetsBuilder.UpdateVisitor(field, term) + } + hc.sort.UpdateVisitor(field, term) + } + + dmHandlerMaker := MakeTopNDocumentMatchHandler + if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil { + dmHandlerMaker = cv.(search.MakeDocumentMatchHandler) + } + // use the application given builder for making the custom document match + // handler and perform callbacks/invocations on the newly made handler. 
+ dmHandler, loadID, err := dmHandlerMaker(searchContext) + if err != nil { + return err + } + + hc.needDocIds = hc.needDocIds || loadID + select { case <-ctx.Done(): return ctx.Err() @@ -130,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } } - err = hc.collectSingle(searchContext, reader, next) + err = hc.prepareDocumentMatch(searchContext, reader, next) + if err != nil { + break + } + + err = dmHandler(next) if err != nil { break } next, err = searcher.Next(searchContext) } + + // help finalize/flush the results in case + // of custom document match handlers. + err = dmHandler(nil) + if err != nil { + return err + } + // compute search duration hc.took = time.Since(startTime) if err != nil { @@ -152,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, var sortByScoreOpt = []string{"_score"} -func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { - var err error +func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, + reader index.IndexReader, d *search.DocumentMatch) (err error) { // visit field terms for features that require it (sort, facets) if len(hc.neededFields) > 0 { @@ -187,33 +253,49 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I hc.sort.Value(d) } - // optimization, we track lowest sorting hit already removed from heap - // with this one comparison, we can avoid all heap operations if - // this hit would have been added and then immediately removed - if hc.lowestMatchOutsideResults != nil { - cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults) - if cmp >= 0 { - // this hit can't possibly be in the result set, so avoid heap ops - ctx.DocumentMatchPool.Put(d) - return nil - } - } + return nil +} - removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) - if removed != nil { - if hc.lowestMatchOutsideResults 
== nil { - hc.lowestMatchOutsideResults = removed - } else { - cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults) - if cmp < 0 { - tmp := hc.lowestMatchOutsideResults - hc.lowestMatchOutsideResults = removed - ctx.DocumentMatchPool.Put(tmp) +func MakeTopNDocumentMatchHandler( + ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) { + var hc *TopNCollector + var ok bool + if hc, ok = ctx.Collector.(*TopNCollector); ok { + return func(d *search.DocumentMatch) error { + if d == nil { + return nil + } + // optimization, we track lowest sorting hit already removed from heap + // with this one comparison, we can avoid all heap operations if + // this hit would have been added and then immediately removed + if hc.lowestMatchOutsideResults != nil { + cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, + hc.lowestMatchOutsideResults) + if cmp >= 0 { + // this hit can't possibly be in the result set, so avoid heap ops + ctx.DocumentMatchPool.Put(d) + return nil + } } - } - } - return nil + removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) + if removed != nil { + if hc.lowestMatchOutsideResults == nil { + hc.lowestMatchOutsideResults = removed + } else { + cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, + removed, hc.lowestMatchOutsideResults) + if cmp < 0 { + tmp := hc.lowestMatchOutsideResults + hc.lowestMatchOutsideResults = removed + ctx.DocumentMatchPool.Put(tmp) + } + } + } + return nil + }, false, nil + } + return nil, false, nil } // visitFieldTerms is responsible for visiting the field terms of the @@ -223,13 +305,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc hc.facetsBuilder.StartDoc() } - err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) { - if hc.facetsBuilder != nil { - hc.facetsBuilder.UpdateVisitor(field, term) - } - hc.sort.UpdateVisitor(field, term) - }) - + err := 
hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() } @@ -257,6 +333,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error { return err } } + doc.Complete(nil) return nil }) @@ -288,5 +365,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { if hc.facetsBuilder != nil { return hc.facetsBuilder.Results() } - return search.FacetResults{} + return nil } diff --git a/vendor/github.com/blevesearch/bleve/search/explanation.go b/vendor/github.com/blevesearch/bleve/search/explanation.go index 766367d77..3b81737b5 100644 --- a/vendor/github.com/blevesearch/bleve/search/explanation.go +++ b/vendor/github.com/blevesearch/bleve/search/explanation.go @@ -17,8 +17,18 @@ package search import ( "encoding/json" "fmt" + "reflect" + + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeExplanation int + +func init() { + var e Explanation + reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) +} + type Explanation struct { Value float64 `json:"value"` Message string `json:"message"` @@ -32,3 +42,14 @@ func (expl *Explanation) String() string { } return string(js) } + +func (expl *Explanation) Size() int { + sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr + + len(expl.Message) + + for _, entry := range expl.Children { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} diff --git a/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_datetime.go b/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_datetime.go index 8657a553a..c45442e4d 100644 --- a/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_datetime.go +++ b/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_datetime.go @@ -15,13 +15,25 @@ package facet import ( + "reflect" "sort" "time" "github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var 
reflectStaticSizeDateTimeFacetBuilder int +var reflectStaticSizedateTimeRange int + +func init() { + var dtfb DateTimeFacetBuilder + reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size()) + var dtr dateTimeRange + reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size()) +} + type dateTimeRange struct { start time.Time end time.Time @@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder { } } +func (fb *DateTimeFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + for k, _ := range fb.ranges { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + reflectStaticSizedateTimeRange + } + + return sizeInBytes +} + func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { r := dateTimeRange{ start: start, diff --git a/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_numeric.go b/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_numeric.go index 2ab5f2789..c1692b549 100644 --- a/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_numeric.go +++ b/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_numeric.go @@ -15,12 +15,24 @@ package facet import ( + "reflect" "sort" "github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeNumericFacetBuilder int +var reflectStaticSizenumericRange int + +func init() { + var nfb NumericFacetBuilder + reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size()) + var nr numericRange + reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size()) +} + type numericRange struct { min *float64 max *float64 @@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder { } } +func (fb *NumericFacetBuilder) Size() 
int { + sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + for k, _ := range fb.ranges { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + reflectStaticSizenumericRange + } + + return sizeInBytes +} + func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) { r := numericRange{ min: min, diff --git a/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_terms.go b/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_terms.go index a41e475a9..5b5901e01 100644 --- a/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_terms.go +++ b/vendor/github.com/blevesearch/bleve/search/facet/facet_builder_terms.go @@ -15,11 +15,20 @@ package facet import ( + "reflect" "sort" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermsFacetBuilder int + +func init() { + var tfb TermsFacetBuilder + reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size()) +} + type TermsFacetBuilder struct { size int field string @@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder { } } +func (fb *TermsFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + return sizeInBytes +} + func (fb *TermsFacetBuilder) Field() string { return fb.field } diff --git a/vendor/github.com/blevesearch/bleve/search/facets_builder.go b/vendor/github.com/blevesearch/bleve/search/facets_builder.go index 05e270413..7fc0bedf3 100644 --- a/vendor/github.com/blevesearch/bleve/search/facets_builder.go +++ b/vendor/github.com/blevesearch/bleve/search/facets_builder.go @@ -15,11 +15,32 @@ package search import ( + "reflect" "sort" "github.com/blevesearch/bleve/index" + 
"github.com/blevesearch/bleve/size" ) +var reflectStaticSizeFacetsBuilder int +var reflectStaticSizeFacetResult int +var reflectStaticSizeTermFacet int +var reflectStaticSizeNumericRangeFacet int +var reflectStaticSizeDateRangeFacet int + +func init() { + var fb FacetsBuilder + reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size()) + var fr FacetResult + reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size()) + var tf TermFacet + reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size()) + var nrf NumericRangeFacet + reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size()) + var drf DateRangeFacet + reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size()) +} + type FacetBuilder interface { StartDoc() UpdateVisitor(field string, term []byte) @@ -27,23 +48,40 @@ type FacetBuilder interface { Result() *FacetResult Field() string + + Size() int } type FacetsBuilder struct { indexReader index.IndexReader - facets map[string]FacetBuilder + facetNames []string + facets []FacetBuilder fields []string } func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { return &FacetsBuilder{ indexReader: indexReader, - facets: make(map[string]FacetBuilder, 0), } } +func (fb *FacetsBuilder) Size() int { + sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr + + for k, v := range fb.facets { + sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k]) + } + + for _, entry := range fb.fields { + sizeInBytes += size.SizeOfString + len(entry) + } + + return sizeInBytes +} + func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { - fb.facets[name] = facetBuilder + fb.facetNames = append(fb.facetNames, name) + fb.facets = append(fb.facets, facetBuilder) fb.fields = append(fb.fields, facetBuilder.Field()) } @@ -213,6 +251,14 @@ type FacetResult struct { DateRanges DateRangeFacets `json:"date_ranges,omitempty"` } +func (fr *FacetResult) Size() int { + return reflectStaticSizeFacetResult + size.SizeOfPtr + + 
len(fr.Field) + + len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) + + len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) + + len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr) +} + func (fr *FacetResult) Merge(other *FacetResult) { fr.Total += other.Total fr.Missing += other.Missing @@ -287,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) { func (fb *FacetsBuilder) Results() FacetResults { fr := make(FacetResults) - for facetName, facetBuilder := range fb.facets { + for i, facetBuilder := range fb.facets { facetResult := facetBuilder.Result() - fr[facetName] = facetResult + fr[fb.facetNames[i]] = facetResult } return fr } diff --git a/vendor/github.com/blevesearch/bleve/search/levenshtein.go b/vendor/github.com/blevesearch/bleve/search/levenshtein.go index ec033143a..687608d3f 100644 --- a/vendor/github.com/blevesearch/bleve/search/levenshtein.go +++ b/vendor/github.com/blevesearch/bleve/search/levenshtein.go @@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int { // in which case the first return val will be the max // and the second will be true, indicating max was exceeded func LevenshteinDistanceMax(a, b string, max int) (int, bool) { + v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil) + return v, wasMax +} + +func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) { la := len(a) lb := len(b) ld := int(math.Abs(float64(la - lb))) if ld > max { - return max, true + return max, true, d } - d := make([]int, la+1) + if cap(d) < la+1 { + d = make([]int, la+1) + } + d = d[:la+1] + var lastdiag, olddiag, temp int for i := 1; i <= la; i++ { @@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) { } // after each row if rowmin isn't less than max stop if rowmin > max { - return max, true + return max, true, d } } - return d[la], false + return d[la], false, d } diff --git a/vendor/github.com/blevesearch/bleve/search/pool.go 
b/vendor/github.com/blevesearch/bleve/search/pool.go index b9b52a613..ba8be8fc2 100644 --- a/vendor/github.com/blevesearch/bleve/search/pool.go +++ b/vendor/github.com/blevesearch/bleve/search/pool.go @@ -14,6 +14,17 @@ package search +import ( + "reflect" +) + +var reflectStaticSizeDocumentMatchPool int + +func init() { + var dmp DocumentMatchPool + reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size()) +} + // DocumentMatchPoolTooSmall is a callback function that can be executed // when the DocumentMatchPool does not have sufficient capacity // By default we just perform just-in-time allocation, but you could log diff --git a/vendor/github.com/blevesearch/bleve/search/query/conjunction.go b/vendor/github.com/blevesearch/bleve/search/query/conjunction.go index 39cc312de..1a7ed1bc0 100644 --- a/vendor/github.com/blevesearch/bleve/search/query/conjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/query/conjunction.go @@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, } ss = append(ss, sr) } + if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) } + return searcher.NewConjunctionSearcher(i, ss, options) } diff --git a/vendor/github.com/blevesearch/bleve/search/query/disjunction.go b/vendor/github.com/blevesearch/bleve/search/query/disjunction.go index dacc3a75b..2bc1d7044 100644 --- a/vendor/github.com/blevesearch/bleve/search/query/disjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/query/disjunction.go @@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) { q.Min = m } -func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { +func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, + options search.SearcherOptions) (search.Searcher, error) { ss := make([]search.Searcher, 0, len(q.Disjuncts)) for _, disjunct := range q.Disjuncts { sr, err := disjunct.Searcher(i, 
m, options) @@ -76,9 +77,17 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, } ss = append(ss, sr) } + if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) + } else if len(ss) == 1 && int(q.Min) == ss[0].Min() { + // apply optimization only if both conditions below are satisfied: + // - disjunction searcher has only 1 child searcher + // - parent searcher's min setting is equal to child searcher's min + + return ss[0], nil } + return searcher.NewDisjunctionSearcher(i, ss, q.Min, options) } diff --git a/vendor/github.com/blevesearch/bleve/search/query/query.go b/vendor/github.com/blevesearch/bleve/search/query/query.go index 1b0d94c01..c7c1eefb8 100644 --- a/vendor/github.com/blevesearch/bleve/search/query/query.go +++ b/vendor/github.com/blevesearch/bleve/search/query/query.go @@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { } expand = func(query Query) (Query, error) { - switch query.(type) { + switch q := query.(type) { case *QueryStringQuery: - q := query.(*QueryStringQuery) parsed, err := parseQuerySyntax(q.Query) if err != nil { return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err) } return expand(parsed) case *ConjunctionQuery: - q := *query.(*ConjunctionQuery) children, err := expandSlice(q.Conjuncts) if err != nil { return nil, err } q.Conjuncts = children - return &q, nil + return q, nil case *DisjunctionQuery: - q := *query.(*DisjunctionQuery) children, err := expandSlice(q.Disjuncts) if err != nil { return nil, err } q.Disjuncts = children - return &q, nil + return q, nil case *BooleanQuery: - q := *query.(*BooleanQuery) var err error q.Must, err = expand(q.Must) if err != nil { @@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { if err != nil { return nil, err } - return &q, nil + return q, nil default: return query, nil } diff --git a/vendor/github.com/blevesearch/bleve/search/query/query_string_lex.go 
b/vendor/github.com/blevesearch/bleve/search/query/query_string_lex.go index 9c59cedde..3a9cf2398 100644 --- a/vendor/github.com/blevesearch/bleve/search/query/query_string_lex.go +++ b/vendor/github.com/blevesearch/bleve/search/query/query_string_lex.go @@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) { // see where to go if !l.seenDot && next == '.' { // stay in this state + l.seenDot = true l.buf += string(next) return inNumOrStrState, true } else if unicode.IsDigit(next) { diff --git a/vendor/github.com/blevesearch/bleve/search/query/regexp.go b/vendor/github.com/blevesearch/bleve/search/query/regexp.go index 09544fcf1..0c87a6f92 100644 --- a/vendor/github.com/blevesearch/bleve/search/query/regexp.go +++ b/vendor/github.com/blevesearch/bleve/search/query/regexp.go @@ -15,7 +15,6 @@ package query import ( - "regexp" "strings" "github.com/blevesearch/bleve/index" @@ -28,7 +27,6 @@ type RegexpQuery struct { Regexp string `json:"regexp"` FieldVal string `json:"field,omitempty"` BoostVal *Boost `json:"boost,omitempty"` - compiled *regexp.Regexp } // NewRegexpQuery creates a new Query which finds @@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti if q.FieldVal == "" { field = m.DefaultSearchField() } - err := q.compile() - if err != nil { - return nil, err + + // require that pattern NOT be anchored to start and end of term. 
+ // do not attempt to remove trailing $, its presence is not + // known to interfere with LiteralPrefix() the way ^ does + // and removing $ introduces possible ambiguities with escaped \$, \\$, etc + actualRegexp := q.Regexp + if strings.HasPrefix(actualRegexp, "^") { + actualRegexp = actualRegexp[1:] // remove leading ^ } - return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) + return searcher.NewRegexpStringSearcher(i, actualRegexp, field, + q.BoostVal.Value(), options) } func (q *RegexpQuery) Validate() error { - return q.compile() -} - -func (q *RegexpQuery) compile() error { - if q.compiled == nil { - // require that pattern NOT be anchored to start and end of term - actualRegexp := q.Regexp - if strings.HasPrefix(actualRegexp, "^") { - actualRegexp = actualRegexp[1:] // remove leading ^ - } - // do not attempt to remove trailing $, it's presence is not - // known to interfere with LiteralPrefix() the way ^ does - // and removing $ introduces possible ambiguities with escaped \$, \\$, etc - var err error - q.compiled, err = regexp.Compile(actualRegexp) - if err != nil { - return err - } - } - return nil + return nil // real validation delayed until searcher constructor } diff --git a/vendor/github.com/blevesearch/bleve/search/query/wildcard.go b/vendor/github.com/blevesearch/bleve/search/query/wildcard.go index 7fd7482c4..747dfe76f 100644 --- a/vendor/github.com/blevesearch/bleve/search/query/wildcard.go +++ b/vendor/github.com/blevesearch/bleve/search/query/wildcard.go @@ -15,7 +15,6 @@ package query import ( - "regexp" "strings" "github.com/blevesearch/bleve/index" @@ -47,7 +46,6 @@ type WildcardQuery struct { Wildcard string `json:"wildcard"` FieldVal string `json:"field,omitempty"` BoostVal *Boost `json:"boost,omitempty"` - compiled *regexp.Regexp } // NewWildcardQuery creates a new Query which finds @@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op if q.FieldVal == "" { 
field = m.DefaultSearchField() } - if q.compiled == nil { - var err error - q.compiled, err = q.convertToRegexp() - if err != nil { - return nil, err - } - } - return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) -} + regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) -func (q *WildcardQuery) Validate() error { - var err error - q.compiled, err = q.convertToRegexp() - return err + return searcher.NewRegexpStringSearcher(i, regexpString, field, + q.BoostVal.Value(), options) } -func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) { - regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) - return regexp.Compile(regexpString) +func (q *WildcardQuery) Validate() error { + return nil // real validation delayed until searcher constructor } diff --git a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_conjunction.go b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_conjunction.go index aad6f9c16..48cdf3ae9 100644 --- a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_conjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_conjunction.go @@ -15,13 +15,27 @@ package scorer import ( + "reflect" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConjunctionQueryScorer int + +func init() { + var cqs ConjunctionQueryScorer + reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size()) +} + type ConjunctionQueryScorer struct { options search.SearcherOptions } +func (s *ConjunctionQueryScorer) Size() int { + return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr +} + func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer { return &ConjunctionQueryScorer{ options: options, @@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ childrenExplanations = make([]*search.Explanation, len(constituents)) } - locations := 
[]search.FieldTermLocationMap{} for i, docMatch := range constituents { sum += docMatch.Score if s.options.Explain { childrenExplanations[i] = docMatch.Expl } - if docMatch.Locations != nil { - locations = append(locations, docMatch.Locations) - } } newScore := sum var newExpl *search.Explanation @@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ rv := constituents[0] rv.Score = newScore rv.Expl = newExpl - if len(locations) == 1 { - rv.Locations = locations[0] - } else if len(locations) > 1 { - rv.Locations = search.MergeLocations(locations) - } + rv.FieldTermLocations = search.MergeFieldTermLocations( + rv.FieldTermLocations, constituents[1:]) return rv } diff --git a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_constant.go b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_constant.go index a65a826f2..dc10fdaa4 100644 --- a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_constant.go +++ b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_constant.go @@ -16,11 +16,20 @@ package scorer import ( "fmt" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConstantScorer int + +func init() { + var cs ConstantScorer + reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size()) +} + type ConstantScorer struct { constant float64 boost float64 @@ -30,6 +39,16 @@ type ConstantScorer struct { queryWeightExplanation *search.Explanation } +func (s *ConstantScorer) Size() int { + sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { rv := ConstantScorer{ options: options, diff --git a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_disjunction.go 
b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_disjunction.go index 184a15d27..7a955e168 100644 --- a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_disjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_disjunction.go @@ -16,14 +16,27 @@ package scorer import ( "fmt" + "reflect" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDisjunctionQueryScorer int + +func init() { + var dqs DisjunctionQueryScorer + reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size()) +} + type DisjunctionQueryScorer struct { options search.SearcherOptions } +func (s *DisjunctionQueryScorer) Size() int { + return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr +} + func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer { return &DisjunctionQueryScorer{ options: options, @@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ childrenExplanations = make([]*search.Explanation, len(constituents)) } - var locations []search.FieldTermLocationMap for i, docMatch := range constituents { sum += docMatch.Score if s.options.Explain { childrenExplanations[i] = docMatch.Expl } - if docMatch.Locations != nil { - locations = append(locations, docMatch.Locations) - } } var rawExpl *search.Explanation @@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ rv := constituents[0] rv.Score = newScore rv.Expl = newExpl - if len(locations) == 1 { - rv.Locations = locations[0] - } else if len(locations) > 1 { - rv.Locations = search.MergeLocations(locations) - } + rv.FieldTermLocations = search.MergeFieldTermLocations( + rv.FieldTermLocations, constituents[1:]) return rv } diff --git a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_term.go b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_term.go index b5f46322c..5544f2d01 100644 --- 
a/vendor/github.com/blevesearch/bleve/search/scorer/scorer_term.go +++ b/vendor/github.com/blevesearch/bleve/search/scorer/scorer_term.go @@ -17,13 +17,22 @@ package scorer import ( "fmt" "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermQueryScorer int + +func init() { + var tqs TermQueryScorer + reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size()) +} + type TermQueryScorer struct { - queryTerm []byte + queryTerm string queryField string queryBoost float64 docTerm uint64 @@ -36,9 +45,24 @@ type TermQueryScorer struct { queryWeightExplanation *search.Explanation } +func (s *TermQueryScorer) Size() int { + sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr + + len(s.queryTerm) + len(s.queryField) + + if s.idfExplanation != nil { + sizeInBytes += s.idfExplanation.Size() + } + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { rv := TermQueryScorer{ - queryTerm: queryTerm, + queryTerm: string(queryTerm), queryField: queryField, queryBoost: queryBoost, docTerm: docTerm, @@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { } s.queryWeightExplanation = &search.Explanation{ Value: s.queryWeight, - Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost), + Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost), Children: childrenExplanations, } } @@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term childrenExplanations := make([]*search.Explanation, 3) childrenExplanations[0] = &search.Explanation{ Value: tf, - Message: 
fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq), + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), } childrenExplanations[1] = &search.Explanation{ Value: termMatch.Norm, @@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term childrenExplanations[2] = s.idfExplanation scoreExplanation = &search.Explanation{ Value: score, - Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID), + Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), Children: childrenExplanations, } } @@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term childExplanations[1] = scoreExplanation scoreExplanation = &search.Explanation{ Value: score, - Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID), + Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID), Children: childExplanations, } } @@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term rv.Expl = scoreExplanation } - if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 { - locs := make([]search.Location, len(termMatch.Vectors)) - locsUsed := 0 - - totalPositions := 0 - for _, v := range termMatch.Vectors { - totalPositions += len(v.ArrayPositions) + if len(termMatch.Vectors) > 0 { + if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { + rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors)) } - positions := make(search.ArrayPositions, totalPositions) - positionsUsed := 0 - rv.Locations = make(search.FieldTermLocationMap) for _, v := range termMatch.Vectors { - tlm := rv.Locations[v.Field] - if tlm == nil { - tlm = make(search.TermLocationMap) - 
rv.Locations[v.Field] = tlm - } - - loc := &locs[locsUsed] - locsUsed++ - - loc.Pos = v.Pos - loc.Start = v.Start - loc.End = v.End - + var ap search.ArrayPositions if len(v.ArrayPositions) > 0 { - loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)] - for i, ap := range v.ArrayPositions { - loc.ArrayPositions[i] = ap + n := len(rv.FieldTermLocations) + if n < cap(rv.FieldTermLocations) { // reuse ap slice if available + ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0] } - positionsUsed += len(v.ArrayPositions) + ap = append(ap, v.ArrayPositions...) } - - tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc) + rv.FieldTermLocations = + append(rv.FieldTermLocations, search.FieldTermLocation{ + Field: v.Field, + Term: s.queryTerm, + Location: search.Location{ + Pos: v.Pos, + Start: v.Start, + End: v.End, + ArrayPositions: ap, + }, + }) } } diff --git a/vendor/github.com/blevesearch/bleve/search/search.go b/vendor/github.com/blevesearch/bleve/search/search.go index f9a92783b..f8a282d16 100644 --- a/vendor/github.com/blevesearch/bleve/search/search.go +++ b/vendor/github.com/blevesearch/bleve/search/search.go @@ -16,11 +16,25 @@ package search import ( "fmt" + "reflect" - "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDocumentMatch int +var reflectStaticSizeSearchContext int +var reflectStaticSizeLocation int + +func init() { + var dm DocumentMatch + reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size()) + var sc SearchContext + reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + type ArrayPositions []uint64 func (ap ArrayPositions) Equals(other ArrayPositions) bool { @@ -47,6 +61,11 @@ type Location struct { ArrayPositions ArrayPositions `json:"array_positions"` } +func (l *Location) Size() int { + return 
reflectStaticSizeLocation + size.SizeOfPtr + + len(l.ArrayPositions)*size.SizeOfUint64 +} + type Locations []*Location type TermLocationMap map[string]Locations @@ -57,6 +76,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) { type FieldTermLocationMap map[string]TermLocationMap +type FieldTermLocation struct { + Field string + Term string + Location Location +} + type FieldFragmentMap map[string][]string type DocumentMatch struct { @@ -74,11 +99,14 @@ type DocumentMatch struct { // fields as float64s and date fields as time.RFC3339 formatted strings. Fields map[string]interface{} `json:"fields,omitempty"` - // if we load the document for this hit, remember it so we dont load again - Document *document.Document `json:"-"` - // used to maintain natural index order HitNumber uint64 `json:"-"` + + // used to temporarily hold field term location information during + // search processing in an efficient, recycle-friendly manner, to + // be later incorporated into the Locations map when search + // results are completed + FieldTermLocations []FieldTermLocation `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -108,15 +136,116 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { indexInternalID := dm.IndexInternalID // remember the []interface{} used for sort sort := dm.Sort + // remember the FieldTermLocations backing array + ftls := dm.FieldTermLocations + for i := range ftls { // recycle the ArrayPositions of each location + ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] + } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) dm.IndexInternalID = indexInternalID[:0] // reuse the []interface{} already allocated (and reset len to 0) dm.Sort = sort[:0] + // reuse the FieldTermLocations already allocated (and reset len to 0) + dm.FieldTermLocations = ftls[:0] return dm } +func (dm *DocumentMatch) 
Size() int { + sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr + + len(dm.Index) + + len(dm.ID) + + len(dm.IndexInternalID) + + if dm.Expl != nil { + sizeInBytes += dm.Expl.Size() + } + + for k, v := range dm.Locations { + sizeInBytes += size.SizeOfString + len(k) + for k1, v1 := range v { + sizeInBytes += size.SizeOfString + len(k1) + + size.SizeOfSlice + for _, entry := range v1 { + sizeInBytes += entry.Size() + } + } + } + + for k, v := range dm.Fragments { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfSlice + + for _, entry := range v { + sizeInBytes += size.SizeOfString + len(entry) + } + } + + for _, entry := range dm.Sort { + sizeInBytes += size.SizeOfString + len(entry) + } + + for k, _ := range dm.Fields { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + } + + return sizeInBytes +} + +// Complete performs final preparation & transformation of the +// DocumentMatch at the end of search processing, also allowing the +// caller to provide an optional preallocated locations slice +func (dm *DocumentMatch) Complete(prealloc []Location) []Location { + // transform the FieldTermLocations slice into the Locations map + nlocs := len(dm.FieldTermLocations) + if nlocs > 0 { + if cap(prealloc) < nlocs { + prealloc = make([]Location, nlocs) + } + prealloc = prealloc[:nlocs] + + var lastField string + var tlm TermLocationMap + + for i, ftl := range dm.FieldTermLocations { + if lastField != ftl.Field { + lastField = ftl.Field + + if dm.Locations == nil { + dm.Locations = make(FieldTermLocationMap) + } + + tlm = dm.Locations[ftl.Field] + if tlm == nil { + tlm = make(TermLocationMap) + dm.Locations[ftl.Field] = tlm + } + } + + loc := &prealloc[i] + *loc = ftl.Location + + if len(loc.ArrayPositions) > 0 { // copy + loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...) 
+ } + + tlm[ftl.Term] = append(tlm[ftl.Term], loc) + + dm.FieldTermLocations[i] = FieldTermLocation{ // recycle + Location: Location{ + ArrayPositions: ftl.Location.ArrayPositions[:0], + }, + } + } + } + + dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle + + return prealloc +} + func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) } @@ -135,6 +264,7 @@ type Searcher interface { SetQueryNorm(float64) Count() uint64 Min() int + Size() int DocumentMatchPoolSize() int } @@ -142,9 +272,26 @@ type Searcher interface { type SearcherOptions struct { Explain bool IncludeTermVectors bool + Score string } // SearchContext represents the context around a single search type SearchContext struct { DocumentMatchPool *DocumentMatchPool + Collector Collector +} + +func (sc *SearchContext) Size() int { + sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr + + reflectStaticSizeDocumentMatchPool + size.SizeOfPtr + + if sc.DocumentMatchPool != nil { + for _, entry := range sc.DocumentMatchPool.avail { + if entry != nil { + sizeInBytes += entry.Size() + } + } + } + + return sizeInBytes } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_boolean.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_boolean.go index a905c29e5..bbbced479 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_boolean.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_boolean.go @@ -16,12 +16,21 @@ package searcher import ( "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeBooleanSearcher int + +func init() { + var bs BooleanSearcher + reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size()) +} + type BooleanSearcher struct { indexReader index.IndexReader mustSearcher search.Searcher @@ -52,6 +61,32 @@ func 
NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc return &rv, nil } +func (s *BooleanSearcher) Size() int { + sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr + + if s.mustSearcher != nil { + sizeInBytes += s.mustSearcher.Size() + } + + if s.shouldSearcher != nil { + sizeInBytes += s.shouldSearcher.Size() + } + + if s.mustNotSearcher != nil { + sizeInBytes += s.mustNotSearcher.Size() + } + + sizeInBytes += s.scorer.Size() + + for _, entry := range s.matches { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (s *BooleanSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 @@ -284,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch return nil, err } } + return rv, nil } @@ -296,41 +332,52 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } } - var err error - if s.mustSearcher != nil { - if s.currMust != nil { - ctx.DocumentMatchPool.Put(s.currMust) - } - s.currMust, err = s.mustSearcher.Advance(ctx, ID) - if err != nil { - return nil, err - } - } - if s.shouldSearcher != nil { - if s.currShould != nil { - ctx.DocumentMatchPool.Put(s.currShould) - } - s.currShould, err = s.shouldSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + // Advance the searchers only if the currentID cursor is trailing the lookup ID, + // additionally if the mustNotSearcher has been initialized, ensure that the + // cursor used to track the mustNotSearcher (currMustNot, which isn't tracked by + // currentID) is trailing the lookup ID as well - for in the case where currentID + // is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT + // advance the currentID or the currMustNot cursors. 
+ if (s.currentID == nil || s.currentID.Compare(ID) < 0) && + (s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) { + var err error + if s.mustSearcher != nil { + if s.currMust != nil { + ctx.DocumentMatchPool.Put(s.currMust) + } + s.currMust, err = s.mustSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } - } - if s.mustNotSearcher != nil { - if s.currMustNot != nil { - ctx.DocumentMatchPool.Put(s.currMustNot) + + if s.shouldSearcher != nil { + if s.currShould != nil { + ctx.DocumentMatchPool.Put(s.currShould) + } + s.currShould, err = s.shouldSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } - s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + + if s.mustNotSearcher != nil { + if s.currMustNot != nil { + ctx.DocumentMatchPool.Put(s.currMustNot) + } + s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } - } - if s.mustSearcher != nil && s.currMust != nil { - s.currentID = s.currMust.IndexInternalID - } else if s.mustSearcher == nil && s.currShould != nil { - s.currentID = s.currShould.IndexInternalID - } else { - s.currentID = nil + if s.mustSearcher != nil && s.currMust != nil { + s.currentID = s.currMust.IndexInternalID + } else if s.mustSearcher == nil && s.currShould != nil { + s.currentID = s.currShould.IndexInternalID + } else { + s.currentID = nil + } } return s.Next(ctx) diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_conjunction.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_conjunction.go index 73fba19cd..ac737bccd 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_conjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_conjunction.go @@ -16,13 +16,22 @@ package searcher import ( "math" + "reflect" "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + 
"github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConjunctionSearcher int + +func init() { + var cs ConjunctionSearcher + reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size()) +} + type ConjunctionSearcher struct { indexReader index.IndexReader searchers OrderedSearcherList @@ -34,14 +43,27 @@ type ConjunctionSearcher struct { options search.SearcherOptions } -func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) { - // build the downstream searchers +func NewConjunctionSearcher(indexReader index.IndexReader, + qsearchers []search.Searcher, options search.SearcherOptions) ( + search.Searcher, error) { + // build the sorted downstream searchers searchers := make(OrderedSearcherList, len(qsearchers)) for i, searcher := range qsearchers { searchers[i] = searcher } - // sort the searchers sort.Sort(searchers) + + // attempt the "unadorned" conjunction optimization only when we + // do not need extra information like freq-norm's or term vectors + if len(searchers) > 1 && + options.Score == "none" && !options.IncludeTermVectors { + rv, err := optimizeCompositeSearcher("conjunction:unadorned", + indexReader, searchers, options) + if err != nil || rv != nil { + return rv, err + } + } + // build our searcher rv := ConjunctionSearcher{ indexReader: indexReader, @@ -51,9 +73,36 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S scorer: scorer.NewConjunctionQueryScorer(options), } rv.computeQueryNorm() + + // attempt push-down conjunction optimization when there's >1 searchers + if len(searchers) > 1 { + rv, err := optimizeCompositeSearcher("conjunction", + indexReader, searchers, options) + if err != nil || rv != nil { + return rv, err + } + } + return &rv, nil } +func (s *ConjunctionSearcher) Size() int { + sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range 
s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (s *ConjunctionSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 @@ -108,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM var rv *search.DocumentMatch var err error OUTER: - for s.currs[s.maxIDIdx] != nil { + for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil { maxID := s.currs[s.maxIDIdx].IndexInternalID i := 0 diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go index b6910ddb6..6a296b68f 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,12 +16,9 @@ package searcher import ( "fmt" - "math" - "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" - "github.com/blevesearch/bleve/search/scorer" ) // DisjunctionMaxClauseCount is a compile time setting that applications can @@ -29,246 +26,84 @@ import ( // error instead of exeucting searches when the size exceeds this value. 
var DisjunctionMaxClauseCount = 0 -type DisjunctionSearcher struct { - indexReader index.IndexReader - searchers OrderedSearcherList - numSearchers int - queryNorm float64 - currs []*search.DocumentMatch - scorer *scorer.DisjunctionQueryScorer - min int - matching []*search.DocumentMatch - matchingIdxs []int - initialized bool -} - -func tooManyClauses(count int) bool { - if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { - return true - } - return false -} - -func tooManyClausesErr() error { - return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", - DisjunctionMaxClauseCount) -} +// DisjunctionHeapTakeover is a compile time setting that applications can +// adjust to control when the DisjunctionSearcher will switch from a simple +// slice implementation to a heap implementation. +var DisjunctionHeapTakeover = 10 func NewDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( - *DisjunctionSearcher, error) { - return newDisjunctionSearcher(indexReader, qsearchers, min, options, - true) + search.Searcher, error) { + return newDisjunctionSearcher(indexReader, qsearchers, min, options, true) } func newDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, - limit bool) ( - *DisjunctionSearcher, error) { - if limit && tooManyClauses(len(qsearchers)) { - return nil, tooManyClausesErr() - } - // build the downstream searchers - searchers := make(OrderedSearcherList, len(qsearchers)) - for i, searcher := range qsearchers { - searchers[i] = searcher - } - // sort the searchers - sort.Sort(sort.Reverse(searchers)) - // build our searcher - rv := DisjunctionSearcher{ - indexReader: indexReader, - searchers: searchers, - numSearchers: len(searchers), - currs: make([]*search.DocumentMatch, len(searchers)), - scorer: scorer.NewDisjunctionQueryScorer(options), - min: int(min), - matching: 
make([]*search.DocumentMatch, len(searchers)), - matchingIdxs: make([]int, len(searchers)), - } - rv.computeQueryNorm() - return &rv, nil -} - -func (s *DisjunctionSearcher) computeQueryNorm() { - // first calculate sum of squared weights - sumOfSquaredWeights := 0.0 - for _, searcher := range s.searchers { - sumOfSquaredWeights += searcher.Weight() - } - // now compute query norm from this - s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) - // finally tell all the downstream searchers the norm - for _, searcher := range s.searchers { - searcher.SetQueryNorm(s.queryNorm) - } -} - -func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error { - var err error - // get all searchers pointing at their first match - for i, searcher := range s.searchers { - if s.currs[i] != nil { - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Next(ctx) - if err != nil { - return err + limit bool) (search.Searcher, error) { + // attempt the "unadorned" disjunction optimization only when we + // do not need extra information like freq-norm's or term vectors + // and the requested min is simple + if len(qsearchers) > 1 && min <= 1 && + options.Score == "none" && !options.IncludeTermVectors { + rv, err := optimizeCompositeSearcher("disjunction:unadorned", + indexReader, qsearchers, options) + if err != nil || rv != nil { + return rv, err } } - err = s.updateMatches() - if err != nil { - return err + if len(qsearchers) > DisjunctionHeapTakeover { + return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options, + limit) } - - s.initialized = true - return nil + return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options, + limit) } -func (s *DisjunctionSearcher) updateMatches() error { - matching := s.matching[:0] - matchingIdxs := s.matchingIdxs[:0] - - for i := 0; i < len(s.currs); i++ { - curr := s.currs[i] - if curr == nil { - continue - } - - if len(matching) > 0 { - cmp := 
curr.IndexInternalID.Compare(matching[0].IndexInternalID) - if cmp > 0 { - continue - } +func optimizeCompositeSearcher(optimizationKind string, + indexReader index.IndexReader, qsearchers []search.Searcher, + options search.SearcherOptions) (search.Searcher, error) { + var octx index.OptimizableContext - if cmp < 0 { - matching = matching[:0] - matchingIdxs = matchingIdxs[:0] - } + for _, searcher := range qsearchers { + o, ok := searcher.(index.Optimizable) + if !ok { + return nil, nil } - matching = append(matching, curr) - matchingIdxs = append(matchingIdxs, i) - } - - s.matching = matching - s.matchingIdxs = matchingIdxs - - return nil -} - -func (s *DisjunctionSearcher) Weight() float64 { - var rv float64 - for _, searcher := range s.searchers { - rv += searcher.Weight() - } - return rv -} - -func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) { - for _, searcher := range s.searchers { - searcher.SetQueryNorm(qnorm) - } -} - -func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) ( - *search.DocumentMatch, error) { - if !s.initialized { - err := s.initSearchers(ctx) - if err != nil { - return nil, err - } - } - var err error - var rv *search.DocumentMatch - - found := false - for !found && len(s.matching) > 0 { - if len(s.matching) >= s.min { - found = true - // score this match - rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) - } - - // invoke next on all the matching searchers - for _, i := range s.matchingIdxs { - searcher := s.searchers[i] - if s.currs[i] != rv { - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Next(ctx) - if err != nil { - return nil, err - } - } - - err = s.updateMatches() + var err error + octx, err = o.Optimize(optimizationKind, octx) if err != nil { return nil, err } - } - return rv, nil -} -func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext, - ID index.IndexInternalID) (*search.DocumentMatch, error) { - if !s.initialized { - err := s.initSearchers(ctx) - if err 
!= nil { - return nil, err - } - } - // get all searchers pointing at their first match - var err error - for i, searcher := range s.searchers { - if s.currs[i] != nil { - if s.currs[i].IndexInternalID.Compare(ID) >= 0 { - continue - } - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Advance(ctx, ID) - if err != nil { - return nil, err + if octx == nil { + return nil, nil } } - err = s.updateMatches() - if err != nil { + optimized, err := octx.Finish() + if err != nil || optimized == nil { return nil, err } - return s.Next(ctx) -} - -func (s *DisjunctionSearcher) Count() uint64 { - // for now return a worst case - var sum uint64 - for _, searcher := range s.searchers { - sum += searcher.Count() + tfr, ok := optimized.(index.TermFieldReader) + if !ok { + return nil, nil } - return sum -} -func (s *DisjunctionSearcher) Close() (rv error) { - for _, searcher := range s.searchers { - err := searcher.Close() - if err != nil && rv == nil { - rv = err - } - } - return rv + return newTermSearcherFromReader(indexReader, tfr, + []byte(optimizationKind), "*", 1.0, options) } -func (s *DisjunctionSearcher) Min() int { - return s.min +func tooManyClauses(count int) bool { + if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { + return true + } + return false } -func (s *DisjunctionSearcher) DocumentMatchPoolSize() int { - rv := len(s.currs) - for _, s := range s.searchers { - rv += s.DocumentMatchPoolSize() - } - return rv +func tooManyClausesErr(count int) error { + return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]", + count, DisjunctionMaxClauseCount) } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go new file mode 100644 index 000000000..ec133f1f8 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go @@ -0,0 +1,343 @@ +// Copyright (c) 2018 
Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "bytes" + "container/heap" + "math" + "reflect" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDisjunctionHeapSearcher int +var reflectStaticSizeSearcherCurr int + +func init() { + var dhs DisjunctionHeapSearcher + reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size()) + + var sc SearcherCurr + reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size()) +} + +type SearcherCurr struct { + searcher search.Searcher + curr *search.DocumentMatch +} + +type DisjunctionHeapSearcher struct { + indexReader index.IndexReader + + numSearchers int + scorer *scorer.DisjunctionQueryScorer + min int + queryNorm float64 + initialized bool + searchers []search.Searcher + heap []*SearcherCurr + + matching []*search.DocumentMatch + matchingCurrs []*SearcherCurr +} + +func newDisjunctionHeapSearcher(indexReader index.IndexReader, + searchers []search.Searcher, min float64, options search.SearcherOptions, + limit bool) ( + *DisjunctionHeapSearcher, error) { + if limit && tooManyClauses(len(searchers)) { + return nil, tooManyClausesErr(len(searchers)) + } + + // build our searcher + rv := DisjunctionHeapSearcher{ + indexReader: indexReader, + searchers: searchers, + numSearchers: len(searchers), + scorer: 
scorer.NewDisjunctionQueryScorer(options), + min: int(min), + matching: make([]*search.DocumentMatch, len(searchers)), + matchingCurrs: make([]*SearcherCurr, len(searchers)), + heap: make([]*SearcherCurr, 0, len(searchers)), + } + rv.computeQueryNorm() + return &rv, nil +} + +func (s *DisjunctionHeapSearcher) Size() int { + sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.matching { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + // for matchingCurrs and heap, just use static size * len + // since searchers and document matches already counted above + sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr + sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr + + return sizeInBytes +} + +func (s *DisjunctionHeapSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error { + // alloc a single block of SearcherCurrs + block := make([]SearcherCurr, len(s.searchers)) + + // get all searchers pointing at their first match + for i, searcher := range s.searchers { + curr, err := searcher.Next(ctx) + if err != nil { + return err + } + if curr != nil { + block[i].searcher = searcher + block[i].curr = curr + heap.Push(s, &block[i]) + } + } + + err := s.updateMatches() + if err != nil { + return err + } + s.initialized = true + return nil +} + +func (s *DisjunctionHeapSearcher) updateMatches() error { + matching := s.matching[:0] + matchingCurrs := 
s.matchingCurrs[:0] + + if len(s.heap) > 0 { + + // top of the heap is our next hit + next := heap.Pop(s).(*SearcherCurr) + matching = append(matching, next.curr) + matchingCurrs = append(matchingCurrs, next) + + // now as long as top of heap matches, keep popping + for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { + next = heap.Pop(s).(*SearcherCurr) + matching = append(matching, next.curr) + matchingCurrs = append(matchingCurrs, next) + } + } + + s.matching = matching + s.matchingCurrs = matchingCurrs + + return nil +} + +func (s *DisjunctionHeapSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( + *search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + + var rv *search.DocumentMatch + found := false + for !found && len(s.matching) > 0 { + if len(s.matching) >= s.min { + found = true + // score this match + rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) + } + + // invoke next on all the matching searchers + for _, matchingCurr := range s.matchingCurrs { + if matchingCurr.curr != rv { + ctx.DocumentMatchPool.Put(matchingCurr.curr) + } + curr, err := matchingCurr.searcher.Next(ctx) + if err != nil { + return nil, err + } + if curr != nil { + matchingCurr.curr = curr + heap.Push(s, matchingCurr) + } + } + + err := s.updateMatches() + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, + ID index.IndexInternalID) (*search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + + // 
if there is anything in matching, toss it back onto the heap + for _, matchingCurr := range s.matchingCurrs { + heap.Push(s, matchingCurr) + } + s.matching = s.matching[:0] + s.matchingCurrs = s.matchingCurrs[:0] + + // find all searchers that actually need to be advanced + // advance them, using s.matchingCurrs as temp storage + for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { + searcherCurr := heap.Pop(s).(*SearcherCurr) + ctx.DocumentMatchPool.Put(searcherCurr.curr) + curr, err := searcherCurr.searcher.Advance(ctx, ID) + if err != nil { + return nil, err + } + if curr != nil { + searcherCurr.curr = curr + s.matchingCurrs = append(s.matchingCurrs, searcherCurr) + } + } + // now all of the searchers that we advanced have to be pushed back + for _, matchingCurr := range s.matchingCurrs { + heap.Push(s, matchingCurr) + } + // reset our temp space + s.matchingCurrs = s.matchingCurrs[:0] + + err := s.updateMatches() + if err != nil { + return nil, err + } + + return s.Next(ctx) +} + +func (s *DisjunctionHeapSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *DisjunctionHeapSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *DisjunctionHeapSearcher) Min() int { + return s.min +} + +func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { + rv := len(s.searchers) + for _, s := range s.searchers { + rv += s.DocumentMatchPoolSize() + } + return rv +} + +// a disjunction searcher implements the index.Optimizable interface +// but only activates on an edge case where the disjunction is a +// wrapper around a single Optimizable child searcher +func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + if len(s.searchers) == 1 { 
+ o, ok := s.searchers[0].(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + } + + return octx, nil +} + +// heap impl + +func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) } + +func (s *DisjunctionHeapSearcher) Less(i, j int) bool { + if s.heap[i].curr == nil { + return true + } else if s.heap[j].curr == nil { + return false + } + return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 +} + +func (s *DisjunctionHeapSearcher) Swap(i, j int) { + s.heap[i], s.heap[j] = s.heap[j], s.heap[i] +} + +func (s *DisjunctionHeapSearcher) Push(x interface{}) { + s.heap = append(s.heap, x.(*SearcherCurr)) +} + +func (s *DisjunctionHeapSearcher) Pop() interface{} { + old := s.heap + n := len(old) + x := old[n-1] + s.heap = old[0 : n-1] + return x +} diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go new file mode 100644 index 000000000..e47f39ad0 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go @@ -0,0 +1,298 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package searcher + +import ( + "math" + "reflect" + "sort" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDisjunctionSliceSearcher int + +func init() { + var ds DisjunctionSliceSearcher + reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size()) +} + +type DisjunctionSliceSearcher struct { + indexReader index.IndexReader + searchers OrderedSearcherList + numSearchers int + queryNorm float64 + currs []*search.DocumentMatch + scorer *scorer.DisjunctionQueryScorer + min int + matching []*search.DocumentMatch + matchingIdxs []int + initialized bool +} + +func newDisjunctionSliceSearcher(indexReader index.IndexReader, + qsearchers []search.Searcher, min float64, options search.SearcherOptions, + limit bool) ( + *DisjunctionSliceSearcher, error) { + if limit && tooManyClauses(len(qsearchers)) { + return nil, tooManyClausesErr(len(qsearchers)) + } + // build the downstream searchers + searchers := make(OrderedSearcherList, len(qsearchers)) + for i, searcher := range qsearchers { + searchers[i] = searcher + } + // sort the searchers + sort.Sort(sort.Reverse(searchers)) + // build our searcher + rv := DisjunctionSliceSearcher{ + indexReader: indexReader, + searchers: searchers, + numSearchers: len(searchers), + currs: make([]*search.DocumentMatch, len(searchers)), + scorer: scorer.NewDisjunctionQueryScorer(options), + min: int(min), + matching: make([]*search.DocumentMatch, len(searchers)), + matchingIdxs: make([]int, len(searchers)), + } + rv.computeQueryNorm() + return &rv, nil +} + +func (s *DisjunctionSliceSearcher) Size() int { + sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + for _, entry := 
range s.matching { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt + + return sizeInBytes +} + +func (s *DisjunctionSliceSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error { + var err error + // get all searchers pointing at their first match + for i, searcher := range s.searchers { + if s.currs[i] != nil { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return err + } + } + + err = s.updateMatches() + if err != nil { + return err + } + + s.initialized = true + return nil +} + +func (s *DisjunctionSliceSearcher) updateMatches() error { + matching := s.matching[:0] + matchingIdxs := s.matchingIdxs[:0] + + for i := 0; i < len(s.currs); i++ { + curr := s.currs[i] + if curr == nil { + continue + } + + if len(matching) > 0 { + cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID) + if cmp > 0 { + continue + } + + if cmp < 0 { + matching = matching[:0] + matchingIdxs = matchingIdxs[:0] + } + } + + matching = append(matching, curr) + matchingIdxs = append(matchingIdxs, i) + } + + s.matching = matching + s.matchingIdxs = matchingIdxs + + return nil +} + +func (s *DisjunctionSliceSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *DisjunctionSliceSearcher) 
Next(ctx *search.SearchContext) ( + *search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + var err error + var rv *search.DocumentMatch + + found := false + for !found && len(s.matching) > 0 { + if len(s.matching) >= s.min { + found = true + // score this match + rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) + } + + // invoke next on all the matching searchers + for _, i := range s.matchingIdxs { + searcher := s.searchers[i] + if s.currs[i] != rv { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return nil, err + } + } + + err = s.updateMatches() + if err != nil { + return nil, err + } + } + return rv, nil +} + +func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext, + ID index.IndexInternalID) (*search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + // get all searchers pointing at their first match + var err error + for i, searcher := range s.searchers { + if s.currs[i] != nil { + if s.currs[i].IndexInternalID.Compare(ID) >= 0 { + continue + } + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Advance(ctx, ID) + if err != nil { + return nil, err + } + } + + err = s.updateMatches() + if err != nil { + return nil, err + } + + return s.Next(ctx) +} + +func (s *DisjunctionSliceSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *DisjunctionSliceSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *DisjunctionSliceSearcher) Min() int { + return s.min +} + +func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { + rv := len(s.currs) + for _, s := range 
s.searchers { + rv += s.DocumentMatchPoolSize() + } + return rv +} + +// a disjunction searcher implements the index.Optimizable interface +// but only activates on an edge case where the disjunction is a +// wrapper around a single Optimizable child searcher +func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + if len(s.searchers) == 1 { + o, ok := s.searchers[0].(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + } + + return octx, nil +} diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_docid.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_docid.go index 06351b4a0..3b258a580 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_docid.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_docid.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDocIDSearcher int + +func init() { + var ds DocIDSearcher + reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size()) +} + // DocIDSearcher returns documents matching a predefined set of identifiers. 
type DocIDSearcher struct { reader index.DocIDReader @@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64 }, nil } +func (s *DocIDSearcher) Size() int { + return reflectStaticSizeDocIDSearcher + size.SizeOfPtr + + s.reader.Size() + + s.scorer.Size() +} + func (s *DocIDSearcher) Count() uint64 { return uint64(s.count) } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_filter.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_filter.go index 219f2ee7e..7c95fb41c 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_filter.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_filter.go @@ -15,10 +15,20 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeFilteringSearcher int + +func init() { + var fs FilteringSearcher + reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size()) +} + // FilterFunc defines a function which can filter documents // returning true means keep the document // returning false means do not keep the document @@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch } } +func (f *FilteringSearcher) Size() int { + return reflectStaticSizeFilteringSearcher + size.SizeOfPtr + + f.child.Size() +} + func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { next, err := f.child.Next(ctx) for next != nil && err == nil { diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go index 90abaa0a8..8176e59b5 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go @@ -15,13 +15,26 @@ package searcher import ( + "fmt" + 
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) +var MaxFuzziness = 2 + func NewFuzzySearcher(indexReader index.IndexReader, term string, prefix, fuzziness int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + + if fuzziness > MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness) + } + + if fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") + } + // Note: we don't byte slice the term for a prefix because of runes. prefixTerm := "" for i, r := range term { @@ -31,7 +44,6 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, break } } - candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, field, prefixTerm) if err != nil { @@ -45,12 +57,40 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, fuzziness int, field, prefixTerm string) (rv []string, err error) { rv = make([]string, 0) + + // in case of advanced reader implementations directly call + // the levenshtein automaton based iterator to collect the + // candidate terms + if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { + fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + rv = append(rv, tfd.Term) + if tooManyClauses(len(rv)) { + return nil, tooManyClausesErr(len(rv)) + } + tfd, err = fieldDict.Next() + } + return rv, err + } + var fieldDict index.FieldDict if len(prefixTerm) > 0 { fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) } else { fieldDict, err = indexReader.FieldDict(field) } + if err != nil { + return nil, err + } defer func() { if cerr := fieldDict.Close(); cerr != nil && err == nil { err = cerr @@ -58,13 
+98,16 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, }() // enumerate terms and check levenshtein distance + var reuse []int tfd, err := fieldDict.Next() for err == nil && tfd != nil { - ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness) + var ld int + var exceeded bool + ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse) if !exceeded && ld <= fuzziness { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return rv, tooManyClausesErr() + return nil, tooManyClausesErr(len(rv)) } } tfd, err = fieldDict.Next() diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go index f8b1b4cf7..289e41678 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go @@ -40,6 +40,11 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, minLon, minLat, maxLon, maxLat, checkBoundaries) var onBoundarySearcher search.Searcher + dvReader, err := indexReader.DocValueReader([]string{field}) + if err != nil { + return nil, err + } + if len(onBoundaryTerms) > 0 { rawOnBoundarySearcher, err := NewMultiTermSearcherBytes(indexReader, onBoundaryTerms, field, boost, options, false) @@ -48,7 +53,7 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, } // add filter to check points near the boundary onBoundarySearcher = NewFilteringSearcher(rawOnBoundarySearcher, - buildRectFilter(indexReader, field, minLon, minLat, maxLon, maxLat)) + buildRectFilter(dvReader, field, minLon, minLat, maxLon, maxLat)) openedSearchers = append(openedSearchers, onBoundarySearcher) } @@ -144,26 +149,25 @@ func relateAndRecurse(start, end uint64, res uint, return nil, nil } -func buildRectFilter(indexReader index.IndexReader, field string, +func 
buildRectFilter(dvReader index.DocValueReader, field string, minLon, minLat, maxLon, maxLat float64) FilterFunc { return func(d *search.DocumentMatch) bool { var lon, lat float64 var found bool - err := indexReader.DocumentVisitFieldTerms(d.IndexInternalID, - []string{field}, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - var i64 int64 - i64, err = prefixCoded.Int64() - if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) - found = true - } + err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + var i64 int64 + i64, err = prefixCoded.Int64() + if err == nil { + lon = geo.MortonUnhashLon(uint64(i64)) + lat = geo.MortonUnhashLat(uint64(i64)) + found = true } - }) + } + }) if err == nil && found { return geo.BoundingBoxContains(lon, lat, minLon, minLat, maxLon, maxLat) diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_geopointdistance.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_geopointdistance.go index fd559766f..a15c194e8 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_geopointdistance.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_geopointdistance.go @@ -39,9 +39,14 @@ func NewGeoPointDistanceSearcher(indexReader index.IndexReader, centerLon, return nil, err } + dvReader, err := indexReader.DocValueReader([]string{field}) + if err != nil { + return nil, err + } + // wrap it in a filtering searcher which checks the actual distance return NewFilteringSearcher(boxSearcher, - buildDistFilter(indexReader, field, centerLon, centerLat, dist)), nil + buildDistFilter(dvReader, field, centerLon, centerLat, 
dist)), nil } // boxSearcher builds a searcher for the described bounding box @@ -87,25 +92,25 @@ func boxSearcher(indexReader index.IndexReader, return boxSearcher, nil } -func buildDistFilter(indexReader index.IndexReader, field string, +func buildDistFilter(dvReader index.DocValueReader, field string, centerLon, centerLat, maxDist float64) FilterFunc { return func(d *search.DocumentMatch) bool { var lon, lat float64 var found bool - err := indexReader.DocumentVisitFieldTerms(d.IndexInternalID, - []string{field}, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - i64, err := prefixCoded.Int64() - if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) - found = true - } + + err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + i64, err := prefixCoded.Int64() + if err == nil { + lon = geo.MortonUnhashLon(uint64(i64)) + lat = geo.MortonUnhashLat(uint64(i64)) + found = true } - }) + } + }) if err == nil && found { dist := geo.Haversin(lon, lat, centerLon, centerLat) if dist <= maxDist/1000 { diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_match_all.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_match_all.go index 822db2ea0..bb6640122 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_match_all.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_match_all.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeMatchAllSearcher int 
+ +func init() { + var mas MatchAllSearcher + reflectStaticSizeMatchAllSearcher = int(reflect.TypeOf(mas).Size()) +} + type MatchAllSearcher struct { indexReader index.IndexReader reader index.DocIDReader @@ -46,6 +56,12 @@ func NewMatchAllSearcher(indexReader index.IndexReader, boost float64, options s }, nil } +func (s *MatchAllSearcher) Size() int { + return reflectStaticSizeMatchAllSearcher + size.SizeOfPtr + + s.reader.Size() + + s.scorer.Size() +} + func (s *MatchAllSearcher) Count() uint64 { return s.count } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_match_none.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_match_none.go index 947596714..a345e17f7 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_match_none.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_match_none.go @@ -15,10 +15,20 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeMatchNoneSearcher int + +func init() { + var mns MatchNoneSearcher + reflectStaticSizeMatchNoneSearcher = int(reflect.TypeOf(mns).Size()) +} + type MatchNoneSearcher struct { indexReader index.IndexReader } @@ -29,6 +39,10 @@ func NewMatchNoneSearcher(indexReader index.IndexReader) (*MatchNoneSearcher, er }, nil } +func (s *MatchNoneSearcher) Size() int { + return reflectStaticSizeMatchNoneSearcher + size.SizeOfPtr +} + func (s *MatchNoneSearcher) Count() uint64 { return uint64(0) } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go index b469beadb..c48366ee2 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go @@ -22,6 +22,10 @@ import ( func NewMultiTermSearcher(indexReader index.IndexReader, 
terms []string, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { + if limit && tooManyClauses(len(terms)) { + return nil, tooManyClausesErr(len(terms)) + } + qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { for _, searcher := range qsearchers { @@ -46,6 +50,10 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { + if limit && tooManyClauses(len(terms)) { + return nil, tooManyClausesErr(len(terms)) + } + qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { for _, searcher := range qsearchers { diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go index 7f42d7250..e52ef9a82 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go @@ -68,7 +68,7 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, return nil, err } if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(terms)) } return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options, @@ -77,6 +77,25 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, func filterCandidateTerms(indexReader index.IndexReader, terms [][]byte, field string) (rv [][]byte, err error) { + + if ir, ok := indexReader.(index.IndexReaderOnly); ok { + fieldDict, err := ir.FieldDictOnly(field, terms, false) + if err != nil { + return nil, err + } + // enumerate the terms (no need to check them again) + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + rv = append(rv, []byte(tfd.Term)) + tfd, err = fieldDict.Next() + } + if 
cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + + return rv, err + } + fieldDict, err := indexReader.FieldDictRange(field, terms[0], terms[len(terms)-1]) if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_phrase.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_phrase.go index 6237cecfd..51b7e5bd8 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_phrase.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_phrase.go @@ -17,21 +17,52 @@ package searcher import ( "fmt" "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePhraseSearcher int + +func init() { + var ps PhraseSearcher + reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size()) +} + type PhraseSearcher struct { - indexReader index.IndexReader - mustSearcher *ConjunctionSearcher + mustSearcher search.Searcher queryNorm float64 currMust *search.DocumentMatch - slop int terms [][]string + path phrasePath + paths []phrasePath + locations []search.Location initialized bool } +func (s *PhraseSearcher) Size() int { + sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr + + if s.mustSearcher != nil { + sizeInBytes += s.mustSearcher.Size() + } + + if s.currMust != nil { + sizeInBytes += s.currMust.Size() + } + + for _, entry := range s.terms { + sizeInBytes += size.SizeOfSlice + for _, entry1 := range entry { + sizeInBytes += size.SizeOfString + len(entry1) + } + } + + return sizeInBytes +} + func NewPhraseSearcher(indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) { // turn flat terms []string into [][]string mterms := make([][]string, len(terms)) @@ -96,7 +127,6 @@ func NewMultiPhraseSearcher(indexReader index.IndexReader, terms [][]string, fie // build our searcher rv := PhraseSearcher{ - indexReader: 
indexReader, mustSearcher: mustSearcher, terms: terms, } @@ -133,6 +163,9 @@ func (s *PhraseSearcher) advanceNextMust(ctx *search.SearchContext) error { var err error if s.mustSearcher != nil { + if s.currMust != nil { + ctx.DocumentMatchPool.Put(s.currMust) + } s.currMust, err = s.mustSearcher.Next(ctx) if err != nil { return err @@ -177,48 +210,64 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, return nil, nil } -// checkCurrMustMatch is soley concerned with determining if the DocumentMatch +// checkCurrMustMatch is solely concerned with determining if the DocumentMatch // pointed to by s.currMust (which satisifies the pre-condition searcher) // also satisfies the phase constraints. if so, it returns a DocumentMatch // for this document, otherwise nil func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch { - rvftlm := make(search.FieldTermLocationMap, 0) - freq := 0 + s.locations = s.currMust.Complete(s.locations) + + locations := s.currMust.Locations + s.currMust.Locations = nil + + ftls := s.currMust.FieldTermLocations + // typically we would expect there to only actually be results in // one field, but we allow for this to not be the case // but, we note that phrase constraints can only be satisfied within // a single field, so we can check them each independently - for field, tlm := range s.currMust.Locations { - - f, rvtlm := s.checkCurrMustMatchField(ctx, tlm) - if f > 0 { - freq += f - rvftlm[field] = rvtlm - } + for field, tlm := range locations { + ftls = s.checkCurrMustMatchField(ctx, field, tlm, ftls) } - if freq > 0 { + if len(ftls) > 0 { // return match rv := s.currMust - rv.Locations = rvftlm + s.currMust = nil + rv.FieldTermLocations = ftls return rv } return nil } -// checkCurrMustMatchField is soley concerned with determining if one particular -// field within the currMust DocumentMatch Locations satisfies the phase -// constraints (possibly more than once). 
if so, the number of times it was -// satisfied, and these locations are returned. otherwise 0 and either -// a nil or empty TermLocationMap -func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) (int, search.TermLocationMap) { - paths := findPhrasePaths(0, nil, s.terms, tlm, nil, 0) - rv := make(search.TermLocationMap, len(s.terms)) - for _, p := range paths { - p.MergeInto(rv) +// checkCurrMustMatchField is solely concerned with determining if one +// particular field within the currMust DocumentMatch Locations +// satisfies the phase constraints (possibly more than once). if so, +// the matching field term locations are appended to the provided +// slice +func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, + field string, tlm search.TermLocationMap, + ftls []search.FieldTermLocation) []search.FieldTermLocation { + if s.path == nil { + s.path = make(phrasePath, 0, len(s.terms)) } - return len(paths), rv + s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0]) + for _, p := range s.paths { + for _, pp := range p { + ftls = append(ftls, search.FieldTermLocation{ + Field: field, + Term: pp.term, + Location: search.Location{ + Pos: pp.loc.Pos, + Start: pp.loc.Start, + End: pp.loc.End, + ArrayPositions: pp.loc.ArrayPositions, + }, + }) + } + } + return ftls } type phrasePart struct { @@ -230,7 +279,7 @@ func (p *phrasePart) String() string { return fmt.Sprintf("[%s %v]", p.term, p.loc) } -type phrasePath []*phrasePart +type phrasePath []phrasePart func (p phrasePath) MergeInto(in search.TermLocationMap) { for _, pp := range p { @@ -238,24 +287,51 @@ func (p phrasePath) MergeInto(in search.TermLocationMap) { } } -// findPhrasePaths is a function to identify phase matches from a set of known -// term locations. the implementation is recursive, so care must be taken -// with arguments and return values. 
+func (p phrasePath) String() string { + rv := "[" + for i, pp := range p { + if i > 0 { + rv += ", " + } + rv += pp.String() + } + rv += "]" + return rv +} + +// findPhrasePaths is a function to identify phase matches from a set +// of known term locations. it recursive so care must be taken with +// arguments and return values. // -// prev - the previous location, nil on first invocation -// phraseTerms - slice containing the phrase terms themselves +// prevPos - the previous location, 0 on first invocation +// ap - array positions of the first candidate phrase part to +// which further recursive phrase parts must match, +// nil on initial invocation or when there are no array positions +// phraseTerms - slice containing the phrase terms, // may contain empty string as placeholder (don't care) // tlm - the Term Location Map containing all relevant term locations -// offset - the offset from the previous that this next term must match // p - the current path being explored (appended to in recursive calls) // this is the primary state being built during the traversal +// remainingSlop - amount of sloppiness that's allowed, which is the +// sum of the editDistances from each matching phrase part, +// where 0 means no sloppiness allowed (all editDistances must be 0), +// decremented during recursion +// rv - the final result being appended to by all the recursive calls // // returns slice of paths, or nil if invocation did not find any successul paths -func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath { - +func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, + tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath { // no more terms if len(phraseTerms) < 1 { - return []phrasePath{p} + // snapshot or copy the recursively built phrasePath p and + // append it to the rv, also optimizing by 
checking if next + // phrasePath item in the rv (which we're about to overwrite) + // is available for reuse + var pcopy phrasePath + if len(rv) < cap(rv) { + pcopy = rv[:len(rv)+1][len(rv)][:0] + } + return append(rv, append(pcopy, p...)) } car := phraseTerms[0] @@ -268,13 +344,13 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s // if prevPos was 0, don't set it to 1 (as thats not a real abs pos) nextPos = 0 // don't advance nextPos if prevPos was 0 } - return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop) + return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop, rv) } - var rv []phrasePath // locations for this term for _, carTerm := range car { locations := tlm[carTerm] + LOCATIONS_LOOP: for _, loc := range locations { if prevPos != 0 && !loc.ArrayPositions.Equals(ap) { // if the array positions are wrong, can't match, try next location @@ -287,11 +363,18 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s dist = editDistance(prevPos+1, loc.Pos) } - // if enough slop reamining, continue recursively + // if enough slop remaining, continue recursively if prevPos == 0 || (remainingSlop-dist) >= 0 { + // skip if we've already used this term+loc already + for _, ppart := range p { + if ppart.term == carTerm && ppart.loc == loc { + continue LOCATIONS_LOOP + } + } + // this location works, add it to the path (but not for empty term) - px := append(p, &phrasePart{term: carTerm, loc: loc}) - rv = append(rv, findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist)...) 
+ px := append(p, phrasePart{term: carTerm, loc: loc}) + rv = findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv) } } } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go index b7cf520ac..4def832c4 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go @@ -21,17 +21,57 @@ import ( "github.com/blevesearch/bleve/search" ) +// NewRegexpStringSearcher is similar to NewRegexpSearcher, but +// additionally optimizes for index readers that handle regexp's. +func NewRegexpStringSearcher(indexReader index.IndexReader, pattern string, + field string, boost float64, options search.SearcherOptions) ( + search.Searcher, error) { + ir, ok := indexReader.(index.IndexReaderRegexp) + if !ok { + r, err := regexp.Compile(pattern) + if err != nil { + return nil, err + } + + return NewRegexpSearcher(indexReader, r, field, boost, options) + } + + fieldDict, err := ir.FieldDictRegexp(field, pattern) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + + var candidateTerms []string + + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + candidateTerms = append(candidateTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + + return NewMultiTermSearcher(indexReader, candidateTerms, field, boost, + options, true) +} + // NewRegexpSearcher creates a searcher which will match documents that // contain terms which match the pattern regexp. The match must be EXACT // matching the entire term. The provided regexp SHOULD NOT start with ^ // or end with $ as this can intefere with the implementation. Separately, // matches will be checked to ensure they match the entire term. 
-func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, +func NewRegexpSearcher(indexReader index.IndexReader, pattern index.Regexp, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { + var candidateTerms []string prefixTerm, complete := pattern.LiteralPrefix() - var candidateTerms []string if complete { // there is no pattern candidateTerms = []string{prefixTerm} @@ -49,7 +89,7 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, } func findRegexpCandidateTerms(indexReader index.IndexReader, - pattern *regexp.Regexp, field, prefixTerm string) (rv []string, err error) { + pattern index.Regexp, field, prefixTerm string) (rv []string, err error) { rv = make([]string, 0) var fieldDict index.FieldDict if len(prefixTerm) > 0 { @@ -70,7 +110,7 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return rv, tooManyClausesErr() + return rv, tooManyClausesErr(len(rv)) } } tfd, err = fieldDict.Next() diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go index 6fae6ae5a..c1af74c76 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermSearcher int + +func init() { + var ts TermSearcher + reflectStaticSizeTermSearcher = int(reflect.TypeOf(ts).Size()) +} + type TermSearcher struct { indexReader index.IndexReader reader index.TermFieldReader @@ -28,28 +38,20 @@ type TermSearcher struct { } func NewTermSearcher(indexReader 
index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - reader, err := indexReader.TermFieldReader([]byte(term), field, true, true, options.IncludeTermVectors) - if err != nil { - return nil, err - } - count, err := indexReader.DocCount() - if err != nil { - _ = reader.Close() - return nil, err - } - scorer := scorer.NewTermQueryScorer([]byte(term), field, boost, count, reader.Count(), options) - return &TermSearcher{ - indexReader: indexReader, - reader: reader, - scorer: scorer, - }, nil + return NewTermSearcherBytes(indexReader, []byte(term), field, boost, options) } func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - reader, err := indexReader.TermFieldReader(term, field, true, true, options.IncludeTermVectors) + needFreqNorm := options.Score != "none" + reader, err := indexReader.TermFieldReader(term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { return nil, err } + return newTermSearcherFromReader(indexReader, reader, term, field, boost, options) +} + +func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermFieldReader, + term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { count, err := indexReader.DocCount() if err != nil { _ = reader.Close() @@ -63,6 +65,13 @@ func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field stri }, nil } +func (s *TermSearcher) Size() int { + return reflectStaticSizeTermSearcher + size.SizeOfPtr + + s.reader.Size() + + s.tfd.Size() + + s.scorer.Size() +} + func (s *TermSearcher) Count() uint64 { return s.reader.Count() } @@ -120,3 +129,13 @@ func (s *TermSearcher) Min() int { func (s *TermSearcher) DocumentMatchPoolSize() int { return 1 } + +func (s *TermSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, 
error) { + o, ok := s.reader.(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + + return octx, nil +} diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go index 05d092249..b5af4631f 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go @@ -27,13 +27,24 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, if err != nil { return nil, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() var terms []string tfd, err := fieldDict.Next() for err == nil && tfd != nil { terms = append(terms, tfd.Term) + if tooManyClauses(len(terms)) { + return nil, tooManyClausesErr(len(terms)) + } tfd, err = fieldDict.Next() } + if err != nil { + return nil, err + } return NewMultiTermSearcher(indexReader, terms, field, boost, options, true) } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_term_range.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_term_range.go index 267c681b4..90be1e11a 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_term_range.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_term_range.go @@ -48,6 +48,12 @@ func NewTermRangeSearcher(indexReader index.IndexReader, return nil, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + var terms []string tfd, err := fieldDict.Next() for err == nil && tfd != nil { diff --git a/vendor/github.com/blevesearch/bleve/search/sort.go b/vendor/github.com/blevesearch/bleve/search/sort.go index 28705d369..e17f70787 100644 --- a/vendor/github.com/blevesearch/bleve/search/sort.go +++ b/vendor/github.com/blevesearch/bleve/search/sort.go @@ -15,6 +15,7 @@ package search import ( + "bytes" 
"encoding/json" "fmt" "math" @@ -251,23 +252,21 @@ func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatc } func (so SortOrder) RequiresScore() bool { - rv := false for _, soi := range so { if soi.RequiresScoring() { - rv = true + return true } } - return rv + return false } func (so SortOrder) RequiresDocID() bool { - rv := false for _, soi := range so { if soi.RequiresDocID() { - rv = true + return true } } - return rv + return false } func (so SortOrder) RequiredFields() []string { @@ -279,7 +278,7 @@ func (so SortOrder) RequiredFields() []string { } func (so SortOrder) CacheIsScore() []bool { - var rv []bool + rv := make([]bool, 0, len(so)) for _, soi := range so { rv = append(rv, soi.RequiresScoring()) } @@ -287,7 +286,7 @@ func (so SortOrder) CacheIsScore() []bool { } func (so SortOrder) CacheDescending() []bool { - var rv []bool + rv := make([]bool, 0, len(so)) for _, soi := range so { rv = append(rv, soi.Descending()) } @@ -344,14 +343,15 @@ type SortField struct { Type SortFieldType Mode SortFieldMode Missing SortFieldMissing - values []string + values [][]byte + tmp [][]byte } // UpdateVisitor notifies this sort field that in this document // this field has the specified term func (s *SortField) UpdateVisitor(field string, term []byte) { if field == s.Field { - s.values = append(s.values, string(term)) + s.values = append(s.values, term) } } @@ -361,7 +361,7 @@ func (s *SortField) UpdateVisitor(field string, term []byte) { func (s *SortField) Value(i *DocumentMatch) string { iTerms := s.filterTermsByType(s.values) iTerm := s.filterTermsByMode(iTerms) - s.values = nil + s.values = s.values[:0] return iTerm } @@ -370,17 +370,17 @@ func (s *SortField) Descending() bool { return s.Desc } -func (s *SortField) filterTermsByMode(terms []string) string { +func (s *SortField) filterTermsByMode(terms [][]byte) string { if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldDefault) { - return terms[0] + return string(terms[0]) } else 
if len(terms) > 1 { switch s.Mode { case SortFieldMin: - sort.Strings(terms) - return terms[0] + sort.Sort(BytesSlice(terms)) + return string(terms[0]) case SortFieldMax: - sort.Strings(terms) - return terms[len(terms)-1] + sort.Sort(BytesSlice(terms)) + return string(terms[len(terms)-1]) } } @@ -402,13 +402,13 @@ func (s *SortField) filterTermsByMode(terms []string) string { // return only the terms which had shift of 0 // if we are in explicit number or date mode, return only valid // prefix coded numbers with shift of 0 -func (s *SortField) filterTermsByType(terms []string) []string { +func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { stype := s.Type if stype == SortFieldAuto { allTermsPrefixCoded := true - var termsWithShiftZero []string + termsWithShiftZero := s.tmp[:0] for _, term := range terms { - valid, shift := numeric.ValidPrefixCodedTerm(term) + valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { termsWithShiftZero = append(termsWithShiftZero, term) } else if !valid { @@ -417,16 +417,18 @@ func (s *SortField) filterTermsByType(terms []string) []string { } if allTermsPrefixCoded { terms = termsWithShiftZero + s.tmp = termsWithShiftZero[:0] } } else if stype == SortFieldAsNumber || stype == SortFieldAsDate { - var termsWithShiftZero []string + termsWithShiftZero := s.tmp[:0] for _, term := range terms { - valid, shift := numeric.ValidPrefixCodedTerm(term) + valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { termsWithShiftZero = append(termsWithShiftZero, term) } } terms = termsWithShiftZero + s.tmp = termsWithShiftZero[:0] } return terms } @@ -486,8 +488,7 @@ func (s *SortField) MarshalJSON() ([]byte, error) { } func (s *SortField) Copy() SearchSort { - var rv SortField - rv = *s + rv := *s return &rv } @@ -499,7 +500,6 @@ type SortDocID struct { // UpdateVisitor is a no-op for SortDocID as it's value // is not dependent on any field terms func (s *SortDocID) UpdateVisitor(field 
string, term []byte) { - } // Value returns the sort value of the DocumentMatch @@ -529,8 +529,7 @@ func (s *SortDocID) MarshalJSON() ([]byte, error) { } func (s *SortDocID) Copy() SearchSort { - var rv SortDocID - rv = *s + rv := *s return &rv } @@ -542,7 +541,6 @@ type SortScore struct { // UpdateVisitor is a no-op for SortScore as it's value // is not dependent on any field terms func (s *SortScore) UpdateVisitor(field string, term []byte) { - } // Value returns the sort value of the DocumentMatch @@ -572,8 +570,7 @@ func (s *SortScore) MarshalJSON() ([]byte, error) { } func (s *SortScore) Copy() SearchSort { - var rv SortScore - rv = *s + rv := *s return &rv } @@ -583,7 +580,6 @@ var maxDistance = string(numeric.MustNewPrefixCodedInt64(math.MaxInt64, 0)) // their distance from the specified point. func NewSortGeoDistance(field, unit string, lon, lat float64, desc bool) ( *SortGeoDistance, error) { - rv := &SortGeoDistance{ Field: field, Desc: desc, @@ -627,7 +623,7 @@ func (s *SortGeoDistance) UpdateVisitor(field string, term []byte) { func (s *SortGeoDistance) Value(i *DocumentMatch) string { iTerms := s.filterTermsByType(s.values) iTerm := s.filterTermsByMode(iTerms) - s.values = nil + s.values = s.values[:0] if iTerm == "" { return maxDistance @@ -705,7 +701,12 @@ func (s *SortGeoDistance) MarshalJSON() ([]byte, error) { } func (s *SortGeoDistance) Copy() SearchSort { - var rv SortGeoDistance - rv = *s + rv := *s return &rv } + +type BytesSlice [][]byte + +func (p BytesSlice) Len() int { return len(p) } +func (p BytesSlice) Less(i, j int) bool { return bytes.Compare(p[i], p[j]) < 0 } +func (p BytesSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } diff --git a/vendor/github.com/blevesearch/bleve/search/util.go b/vendor/github.com/blevesearch/bleve/search/util.go index 83212af1f..19dd5d68b 100644 --- a/vendor/github.com/blevesearch/bleve/search/util.go +++ b/vendor/github.com/blevesearch/bleve/search/util.go @@ -40,3 +40,30 @@ func MergeTermLocationMaps(rv, 
other TermLocationMap) TermLocationMap { } return rv } + +func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation { + n := len(dest) + for _, dm := range matches { + n += len(dm.FieldTermLocations) + } + if cap(dest) < n { + dest = append(make([]FieldTermLocation, 0, n), dest...) + } + + for _, dm := range matches { + for _, ftl := range dm.FieldTermLocations { + dest = append(dest, FieldTermLocation{ + Field: ftl.Field, + Term: ftl.Term, + Location: Location{ + Pos: ftl.Location.Pos, + Start: ftl.Location.Start, + End: ftl.Location.End, + ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), + }, + }) + } + } + + return dest +} diff --git a/vendor/github.com/blevesearch/bleve/size/sizes.go b/vendor/github.com/blevesearch/bleve/size/sizes.go new file mode 100644 index 000000000..0990bf86e --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/size/sizes.go @@ -0,0 +1,59 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package size + +import ( + "reflect" +) + +func init() { + var b bool + SizeOfBool = int(reflect.TypeOf(b).Size()) + var f32 float32 + SizeOfFloat32 = int(reflect.TypeOf(f32).Size()) + var f64 float64 + SizeOfFloat64 = int(reflect.TypeOf(f64).Size()) + var i int + SizeOfInt = int(reflect.TypeOf(i).Size()) + var m map[int]int + SizeOfMap = int(reflect.TypeOf(m).Size()) + var ptr *int + SizeOfPtr = int(reflect.TypeOf(ptr).Size()) + var slice []int + SizeOfSlice = int(reflect.TypeOf(slice).Size()) + var str string + SizeOfString = int(reflect.TypeOf(str).Size()) + var u8 uint8 + SizeOfUint8 = int(reflect.TypeOf(u8).Size()) + var u16 uint16 + SizeOfUint16 = int(reflect.TypeOf(u16).Size()) + var u32 uint32 + SizeOfUint32 = int(reflect.TypeOf(u32).Size()) + var u64 uint64 + SizeOfUint64 = int(reflect.TypeOf(u64).Size()) +} + +var SizeOfBool int +var SizeOfFloat32 int +var SizeOfFloat64 int +var SizeOfInt int +var SizeOfMap int +var SizeOfPtr int +var SizeOfSlice int +var SizeOfString int +var SizeOfUint8 int +var SizeOfUint16 int +var SizeOfUint32 int +var SizeOfUint64 int diff --git a/vendor/github.com/boltdb/bolt/bolt_mips64le.go b/vendor/github.com/boltdb/bolt/bolt_mips64le.go deleted file mode 100644 index e8a9c3d14..000000000 --- a/vendor/github.com/boltdb/bolt/bolt_mips64le.go +++ /dev/null @@ -1,11 +0,0 @@ -// +build mips64le -package bolt - -// maxMapSize represents the largest mmap size supported by Bolt. -const maxMapSize = 0xFFFFFFFFFFFF // 256TB - -// maxAllocSize is the size used when creating array pointers. -const maxAllocSize = 0x7FFFFFFF - -// brokenUnaligned Are unaligned load/stores broken on this arch? 
-var brokenUnaligned = false diff --git a/vendor/github.com/boltdb/bolt/bolt_mipsle.go b/vendor/github.com/boltdb/bolt/bolt_mipsle.go deleted file mode 100644 index d5af4d128..000000000 --- a/vendor/github.com/boltdb/bolt/bolt_mipsle.go +++ /dev/null @@ -1,11 +0,0 @@ -// +build mipsle -package bolt - -// maxMapSize represents the largest mmap size supported by Bolt. -const maxMapSize = 0x40000000 // 1GB - -// maxAllocSize is the size used when creating array pointers. -const maxAllocSize = 0xFFFFFFF - -// brokenUnaligned Are unaligned load/stores broken on this arch? -var brokenUnaligned = false diff --git a/vendor/github.com/boltdb/bolt/freelist.go b/vendor/github.com/boltdb/bolt/freelist.go deleted file mode 100644 index aba48f58c..000000000 --- a/vendor/github.com/boltdb/bolt/freelist.go +++ /dev/null @@ -1,252 +0,0 @@ -package bolt - -import ( - "fmt" - "sort" - "unsafe" -) - -// freelist represents a list of all pages that are available for allocation. -// It also tracks pages that have been freed but are still in use by open transactions. -type freelist struct { - ids []pgid // all free and available free page ids. - pending map[txid][]pgid // mapping of soon-to-be free page ids by tx. - cache map[pgid]bool // fast lookup of all free and pending page ids. -} - -// newFreelist returns an empty, initialized freelist. -func newFreelist() *freelist { - return &freelist{ - pending: make(map[txid][]pgid), - cache: make(map[pgid]bool), - } -} - -// size returns the size of the page after serialization. -func (f *freelist) size() int { - n := f.count() - if n >= 0xFFFF { - // The first element will be used to store the count. See freelist.write. 
- n++ - } - return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * n) -} - -// count returns count of pages on the freelist -func (f *freelist) count() int { - return f.free_count() + f.pending_count() -} - -// free_count returns count of free pages -func (f *freelist) free_count() int { - return len(f.ids) -} - -// pending_count returns count of pending pages -func (f *freelist) pending_count() int { - var count int - for _, list := range f.pending { - count += len(list) - } - return count -} - -// copyall copies into dst a list of all free ids and all pending ids in one sorted list. -// f.count returns the minimum length required for dst. -func (f *freelist) copyall(dst []pgid) { - m := make(pgids, 0, f.pending_count()) - for _, list := range f.pending { - m = append(m, list...) - } - sort.Sort(m) - mergepgids(dst, f.ids, m) -} - -// allocate returns the starting page id of a contiguous list of pages of a given size. -// If a contiguous block cannot be found then 0 is returned. -func (f *freelist) allocate(n int) pgid { - if len(f.ids) == 0 { - return 0 - } - - var initial, previd pgid - for i, id := range f.ids { - if id <= 1 { - panic(fmt.Sprintf("invalid page allocation: %d", id)) - } - - // Reset initial page if this is not contiguous. - if previd == 0 || id-previd != 1 { - initial = id - } - - // If we found a contiguous block then remove it and return it. - if (id-initial)+1 == pgid(n) { - // If we're allocating off the beginning then take the fast path - // and just adjust the existing slice. This will use extra memory - // temporarily but the append() in free() will realloc the slice - // as is necessary. - if (i + 1) == n { - f.ids = f.ids[i+1:] - } else { - copy(f.ids[i-n+1:], f.ids[i+1:]) - f.ids = f.ids[:len(f.ids)-n] - } - - // Remove from the free cache. 
- for i := pgid(0); i < pgid(n); i++ { - delete(f.cache, initial+i) - } - - return initial - } - - previd = id - } - return 0 -} - -// free releases a page and its overflow for a given transaction id. -// If the page is already free then a panic will occur. -func (f *freelist) free(txid txid, p *page) { - if p.id <= 1 { - panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id)) - } - - // Free page and all its overflow pages. - var ids = f.pending[txid] - for id := p.id; id <= p.id+pgid(p.overflow); id++ { - // Verify that page is not already free. - if f.cache[id] { - panic(fmt.Sprintf("page %d already freed", id)) - } - - // Add to the freelist and cache. - ids = append(ids, id) - f.cache[id] = true - } - f.pending[txid] = ids -} - -// release moves all page ids for a transaction id (or older) to the freelist. -func (f *freelist) release(txid txid) { - m := make(pgids, 0) - for tid, ids := range f.pending { - if tid <= txid { - // Move transaction's pending pages to the available freelist. - // Don't remove from the cache since the page is still free. - m = append(m, ids...) - delete(f.pending, tid) - } - } - sort.Sort(m) - f.ids = pgids(f.ids).merge(m) -} - -// rollback removes the pages from a given pending tx. -func (f *freelist) rollback(txid txid) { - // Remove page ids from cache. - for _, id := range f.pending[txid] { - delete(f.cache, id) - } - - // Remove pages from pending list. - delete(f.pending, txid) -} - -// freed returns whether a given page is in the free list. -func (f *freelist) freed(pgid pgid) bool { - return f.cache[pgid] -} - -// read initializes the freelist from a freelist page. -func (f *freelist) read(p *page) { - // If the page.count is at the max uint16 value (64k) then it's considered - // an overflow and the size of the freelist is stored as the first element. 
- idx, count := 0, int(p.count) - if count == 0xFFFF { - idx = 1 - count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0]) - } - - // Copy the list of page ids from the freelist. - if count == 0 { - f.ids = nil - } else { - ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count] - f.ids = make([]pgid, len(ids)) - copy(f.ids, ids) - - // Make sure they're sorted. - sort.Sort(pgids(f.ids)) - } - - // Rebuild the page cache. - f.reindex() -} - -// write writes the page ids onto a freelist page. All free and pending ids are -// saved to disk since in the event of a program crash, all pending ids will -// become free. -func (f *freelist) write(p *page) error { - // Combine the old free pgids and pgids waiting on an open transaction. - - // Update the header flag. - p.flags |= freelistPageFlag - - // The page.count can only hold up to 64k elements so if we overflow that - // number then we handle it by putting the size in the first element. - lenids := f.count() - if lenids == 0 { - p.count = uint16(lenids) - } else if lenids < 0xFFFF { - p.count = uint16(lenids) - f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:]) - } else { - p.count = 0xFFFF - ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids) - f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:]) - } - - return nil -} - -// reload reads the freelist from a page and filters out pending items. -func (f *freelist) reload(p *page) { - f.read(p) - - // Build a cache of only pending pages. - pcache := make(map[pgid]bool) - for _, pendingIDs := range f.pending { - for _, pendingID := range pendingIDs { - pcache[pendingID] = true - } - } - - // Check each page in the freelist and build a new available freelist - // with any pages not in the pending lists. 
- var a []pgid - for _, id := range f.ids { - if !pcache[id] { - a = append(a, id) - } - } - f.ids = a - - // Once the available list is rebuilt then rebuild the free cache so that - // it includes the available and pending free pages. - f.reindex() -} - -// reindex rebuilds the free cache based on available and pending free lists. -func (f *freelist) reindex() { - f.cache = make(map[pgid]bool, len(f.ids)) - for _, id := range f.ids { - f.cache[id] = true - } - for _, pendingIDs := range f.pending { - for _, pendingID := range pendingIDs { - f.cache[pendingID] = true - } - } -} diff --git a/vendor/github.com/couchbase/vellum/automaton.go b/vendor/github.com/couchbase/vellum/automaton.go index 47526595b..70398f2d4 100644 --- a/vendor/github.com/couchbase/vellum/automaton.go +++ b/vendor/github.com/couchbase/vellum/automaton.go @@ -81,5 +81,5 @@ func (m *AlwaysMatch) Accept(int, byte) int { return 0 } -// creating an alwaysMatchAutomaton to avoid unnecesary repeated allocations. +// creating an alwaysMatchAutomaton to avoid unnecessary repeated allocations. 
var alwaysMatchAutomaton = &AlwaysMatch{} diff --git a/vendor/github.com/couchbase/vellum/builder.go b/vendor/github.com/couchbase/vellum/builder.go index b21db9807..f79332957 100644 --- a/vendor/github.com/couchbase/vellum/builder.go +++ b/vendor/github.com/couchbase/vellum/builder.go @@ -38,8 +38,7 @@ type Builder struct { encoder encoder opts *BuilderOpts - builderNodePool builderNodePool - transitionPool transitionPool + builderNodePool *builderNodePool } const noneAddr = 1 @@ -51,12 +50,14 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { if opts == nil { opts = defaultBuilderOpts } + builderNodePool := &builderNodePool{} rv := &Builder{ - registry: newRegistry(opts.RegistryTableSize, opts.RegistryMRUSize), - opts: opts, - lastAddr: noneAddr, + unfinished: newUnfinishedNodes(builderNodePool), + registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize), + builderNodePool: builderNodePool, + opts: opts, + lastAddr: noneAddr, } - rv.unfinished = newUnfinishedNodes(&rv.builderNodePool) var err error rv.encoder, err = loadEncoder(opts.Encoder, w) @@ -71,9 +72,7 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { } func (b *Builder) Reset(w io.Writer) error { - b.transitionPool.reset() - b.builderNodePool.reset() - b.unfinished.Reset(&b.builderNodePool) + b.unfinished.Reset() b.registry.Reset() b.lastAddr = noneAddr b.encoder.reset(w) @@ -107,7 +106,7 @@ func (b *Builder) Insert(key []byte, val uint64) error { return err } b.copyLastKey(key) - b.unfinished.addSuffix(key[prefixLen:], out, &b.builderNodePool) + b.unfinished.addSuffix(key[prefixLen:], out) return nil } @@ -142,7 +141,7 @@ func (b *Builder) compileFrom(iState int) error { if addr == noneAddr { node = b.unfinished.popEmpty() } else { - node = b.unfinished.popFreeze(addr, &b.transitionPool) + node = b.unfinished.popFreeze(addr) } var err error addr, err = b.compile(node) @@ -150,7 +149,7 @@ func (b *Builder) compileFrom(iState int) error { 
return nil } } - b.unfinished.topLastFreeze(addr, &b.transitionPool) + b.unfinished.topLastFreeze(addr) return nil } @@ -183,22 +182,25 @@ type unfinishedNodes struct { // this means calls get() and pushXYZ() must be paired, // as well as calls put() and popXYZ() cache []builderNodeUnfinished + + builderNodePool *builderNodePool } -func (u *unfinishedNodes) Reset(p *builderNodePool) { +func (u *unfinishedNodes) Reset() { u.stack = u.stack[:0] for i := 0; i < len(u.cache); i++ { u.cache[i] = builderNodeUnfinished{} } - u.pushEmpty(false, p) + u.pushEmpty(false) } func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes { rv := &unfinishedNodes{ - stack: make([]*builderNodeUnfinished, 0, 64), - cache: make([]builderNodeUnfinished, 64), + stack: make([]*builderNodeUnfinished, 0, 64), + cache: make([]builderNodeUnfinished, 64), + builderNodePool: p, } - rv.pushEmpty(false, p) + rv.pushEmpty(false) return rv } @@ -249,9 +251,9 @@ func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte, return i, out } -func (u *unfinishedNodes) pushEmpty(final bool, p *builderNodePool) { +func (u *unfinishedNodes) pushEmpty(final bool) { next := u.get() - next.node = p.alloc() + next.node = u.builderNodePool.Get() next.node.final = final u.stack = append(u.stack, next) } @@ -265,11 +267,11 @@ func (u *unfinishedNodes) popRoot() *builderNode { return rv } -func (u *unfinishedNodes) popFreeze(addr int, tp *transitionPool) *builderNode { +func (u *unfinishedNodes) popFreeze(addr int) *builderNode { l := len(u.stack) var unfinished *builderNodeUnfinished u.stack, unfinished = u.stack[:l-1], u.stack[l-1] - unfinished.lastCompiled(addr, tp) + unfinished.lastCompiled(addr) rv := unfinished.node u.put() return rv @@ -289,12 +291,12 @@ func (u *unfinishedNodes) setRootOutput(out uint64) { u.stack[0].node.finalOutput = out } -func (u *unfinishedNodes) topLastFreeze(addr int, tp *transitionPool) { +func (u *unfinishedNodes) topLastFreeze(addr int) { last := len(u.stack) - 1 - 
u.stack[last].lastCompiled(addr, tp) + u.stack[last].lastCompiled(addr) } -func (u *unfinishedNodes) addSuffix(bs []byte, out uint64, p *builderNodePool) { +func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) { if len(bs) == 0 { return } @@ -304,13 +306,13 @@ func (u *unfinishedNodes) addSuffix(bs []byte, out uint64, p *builderNodePool) { u.stack[last].lastOut = out for _, b := range bs[1:] { next := u.get() - next.node = p.alloc() + next.node = u.builderNodePool.Get() next.hasLastT = true next.lastIn = b next.lastOut = 0 u.stack = append(u.stack, next) } - u.pushEmpty(true, p) + u.pushEmpty(true) } type builderNodeUnfinished struct { @@ -320,17 +322,17 @@ type builderNodeUnfinished struct { hasLastT bool } -func (b *builderNodeUnfinished) lastCompiled(addr int, tp *transitionPool) { +func (b *builderNodeUnfinished) lastCompiled(addr int) { if b.hasLastT { transIn := b.lastIn transOut := b.lastOut b.hasLastT = false b.lastOut = 0 - trans := tp.alloc() - trans.in = transIn - trans.out = transOut - trans.addr = addr - b.node.trans = append(b.node.trans, trans) + b.node.trans = append(b.node.trans, transition{ + in: transIn, + out: transOut, + addr: addr, + }) } } @@ -338,8 +340,8 @@ func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) { if b.node.final { b.node.finalOutput = outputCat(prefix, b.node.finalOutput) } - for _, t := range b.node.trans { - t.out = outputCat(prefix, t.out) + for i := range b.node.trans { + b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out) } if b.hasLastT { b.lastOut = outputCat(prefix, b.lastOut) @@ -348,8 +350,22 @@ func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) { type builderNode struct { finalOutput uint64 - trans []*transition + trans []transition final bool + + // intrusive linked list + next *builderNode +} + +// reset resets the receiver builderNode to a re-usable state. 
+func (n *builderNode) reset() { + n.final = false + n.finalOutput = 0 + for i := range n.trans { + n.trans[i] = emptyTransition + } + n.trans = n.trans[:0] + n.next = nil } func (n *builderNode) equiv(o *builderNode) bool { @@ -377,6 +393,8 @@ func (n *builderNode) equiv(o *builderNode) bool { return true } +var emptyTransition = transition{} + type transition struct { out uint64 addr int @@ -398,56 +416,37 @@ func outputCat(l, r uint64) uint64 { return l + r } -// the next builderNode to alloc() will be all[nextOuter][nextInner] +// builderNodePool pools builderNodes using a singly linked list. +// +// NB: builderNode lifecylce is described by the following interactions - +// +------------------------+ +----------------------+ +// | Unfinished Nodes | Transfer once | Registry | +// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) | +// +------------------------+ marked frozen +----------------------+ +// ^ | +// | | +// | Put() +// | Get() on +-------------------+ when +// +-new char--------| builderNode Pool |<-----------evicted +// +-------------------+ type builderNodePool struct { - all [][]builderNode - nextOuter int - nextInner int -} - -func (p *builderNodePool) reset() { - p.nextOuter = 0 - p.nextInner = 0 + head *builderNode } -func (p *builderNodePool) alloc() *builderNode { - if p.nextOuter >= len(p.all) { - p.all = append(p.all, make([]builderNode, 256)) +func (p *builderNodePool) Get() *builderNode { + if p.head == nil { + return &builderNode{} } - rv := &p.all[p.nextOuter][p.nextInner] - p.nextInner += 1 - if p.nextInner >= len(p.all[p.nextOuter]) { - p.nextOuter += 1 - p.nextInner = 0 - } - rv.finalOutput = 0 - rv.trans = rv.trans[:0] - rv.final = false - return rv + head := p.head + p.head = p.head.next + return head } -// the next transition to alloc() will be all[nextOuter][nextInner] -type transitionPool struct { - all [][]transition - nextOuter int - nextInner int -} - -func (p *transitionPool) reset() { - 
p.nextOuter = 0 - p.nextInner = 0 -} - -func (p *transitionPool) alloc() *transition { - if p.nextOuter >= len(p.all) { - p.all = append(p.all, make([]transition, 256)) - } - rv := &p.all[p.nextOuter][p.nextInner] - p.nextInner += 1 - if p.nextInner >= len(p.all[p.nextOuter]) { - p.nextOuter += 1 - p.nextInner = 0 +func (p *builderNodePool) Put(v *builderNode) { + if v == nil { + return } - *rv = transition{} - return rv + v.reset() + v.next = p.head + p.head = v } diff --git a/vendor/github.com/couchbase/vellum/decoder_v1.go b/vendor/github.com/couchbase/vellum/decoder_v1.go index 5a0ea6887..d56e61db5 100644 --- a/vendor/github.com/couchbase/vellum/decoder_v1.go +++ b/vendor/github.com/couchbase/vellum/decoder_v1.go @@ -29,8 +29,6 @@ func init() { type decoderV1 struct { data []byte - root uint64 - len uint64 } func newDecoderV1(data []byte) *decoderV1 { @@ -219,7 +217,7 @@ func (f *fstStateV1) Final() bool { } func (f *fstStateV1) FinalOutput() uint64 { - if f.numTrans > 0 && f.final && f.outSize > 0 { + if f.final && f.outSize > 0 { return readPackedUint(f.data[f.outFinal : f.outFinal+f.outSize]) } return 0 diff --git a/vendor/github.com/couchbase/vellum/fst.go b/vendor/github.com/couchbase/vellum/fst.go index ecc528395..64ee21a41 100644 --- a/vendor/github.com/couchbase/vellum/fst.go +++ b/vendor/github.com/couchbase/vellum/fst.go @@ -74,8 +74,8 @@ func (f *FST) get(input []byte, prealloc fstState) (uint64, bool, error) { if err != nil { return 0, false, err } - for i := range input { - _, curr, output := state.TransitionFor(input[i]) + for _, c := range input { + _, curr, output := state.TransitionFor(c) if curr == noneAddr { return 0, false, nil } @@ -243,6 +243,52 @@ func (f *FST) Reader() (*Reader, error) { return &Reader{f: f}, nil } +func (f *FST) GetMinKey() ([]byte, error) { + var rv []byte + + curr := f.decoder.getRoot() + state, err := f.decoder.stateAt(curr, nil) + if err != nil { + return nil, err + } + + for !state.Final() { + nextTrans := 
state.TransitionAt(0) + _, curr, _ = state.TransitionFor(nextTrans) + state, err = f.decoder.stateAt(curr, state) + if err != nil { + return nil, err + } + + rv = append(rv, nextTrans) + } + + return rv, nil +} + +func (f *FST) GetMaxKey() ([]byte, error) { + var rv []byte + + curr := f.decoder.getRoot() + state, err := f.decoder.stateAt(curr, nil) + if err != nil { + return nil, err + } + + for state.NumTransitions() > 0 { + nextTrans := state.TransitionAt(state.NumTransitions() - 1) + _, curr, _ = state.TransitionFor(nextTrans) + state, err = f.decoder.stateAt(curr, state) + if err != nil { + return nil, err + } + + rv = append(rv, nextTrans) + } + + return rv, nil +} + // A Reader is meant for a single threaded use type Reader struct { f *FST diff --git a/vendor/github.com/couchbase/vellum/fst_iterator.go b/vendor/github.com/couchbase/vellum/fst_iterator.go index 389ac64aa..eb731395b 100644 --- a/vendor/github.com/couchbase/vellum/fst_iterator.go +++ b/vendor/github.com/couchbase/vellum/fst_iterator.go @@ -76,7 +76,8 @@ func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, // Reset resets the Iterator' internal state to allow for iterator // reuse (e.g. pooling). 
-func (i *FSTIterator) Reset(f *FST, startKeyInclusive, endKeyExclusive []byte, aut Automaton) error { +func (i *FSTIterator) Reset(f *FST, + startKeyInclusive, endKeyExclusive []byte, aut Automaton) error { if aut == nil { aut = alwaysMatchAutomaton } @@ -91,14 +92,14 @@ func (i *FSTIterator) Reset(f *FST, startKeyInclusive, endKeyExclusive []byte, a // pointTo attempts to point us to the specified location func (i *FSTIterator) pointTo(key []byte) error { - // tried to seek before start if bytes.Compare(key, i.startKeyInclusive) < 0 { key = i.startKeyInclusive } - // trid to see past end - if i.endKeyExclusive != nil && bytes.Compare(key, i.endKeyExclusive) > 0 { + // tried to see past end + if i.endKeyExclusive != nil && + bytes.Compare(key, i.endKeyExclusive) > 0 { key = i.endKeyExclusive } @@ -121,21 +122,23 @@ func (i *FSTIterator) pointTo(key []byte) error { i.statesStack = append(i.statesStack, root) i.autStatesStack = append(i.autStatesStack, autStart) for j := 0; j < len(key); j++ { + keyJ := key[j] curr := i.statesStack[len(i.statesStack)-1] autCurr := i.autStatesStack[len(i.autStatesStack)-1] - pos, nextAddr, nextVal := curr.TransitionFor(key[j]) + pos, nextAddr, nextVal := curr.TransitionFor(keyJ) if nextAddr == noneAddr { // needed transition doesn't exist // find last trans before the one we needed - for q := 0; q < curr.NumTransitions(); q++ { - if curr.TransitionAt(q) < key[j] { + for q := curr.NumTransitions() - 1; q >= 0; q-- { + if curr.TransitionAt(q) < keyJ { maxQ = q + break } } break } - autNext := i.aut.Accept(autCurr, key[j]) + autNext := i.aut.Accept(autCurr, keyJ) next, err := i.f.decoder.stateAt(nextAddr, nil) if err != nil { @@ -143,14 +146,16 @@ func (i *FSTIterator) pointTo(key []byte) error { } i.statesStack = append(i.statesStack, next) - i.keysStack = append(i.keysStack, key[j]) + i.keysStack = append(i.keysStack, keyJ) i.keysPosStack = append(i.keysPosStack, pos) i.valsStack = append(i.valsStack, nextVal) i.autStatesStack = 
append(i.autStatesStack, autNext) continue } - if !i.statesStack[len(i.statesStack)-1].Final() || !i.aut.IsMatch(i.autStatesStack[len(i.autStatesStack)-1]) || bytes.Compare(i.keysStack, key) < 0 { + if !i.statesStack[len(i.statesStack)-1].Final() || + !i.aut.IsMatch(i.autStatesStack[len(i.autStatesStack)-1]) || + bytes.Compare(i.keysStack, key) < 0 { return i.next(maxQ) } @@ -181,15 +186,12 @@ func (i *FSTIterator) Next() error { } func (i *FSTIterator) next(lastOffset int) error { - // remember where we started - if cap(i.nextStart) < len(i.keysStack) { - i.nextStart = make([]byte, len(i.keysStack)) - } else { - i.nextStart = i.nextStart[0:len(i.keysStack)] - } - copy(i.nextStart, i.keysStack) + i.nextStart = append(i.nextStart[:0], i.keysStack...) + nextOffset := lastOffset + 1 + +OUTER: for true { curr := i.statesStack[len(i.statesStack)-1] autCurr := i.autStatesStack[len(i.autStatesStack)-1] @@ -200,58 +202,62 @@ func (i *FSTIterator) next(lastOffset int) error { return nil } - nextOffset := lastOffset + 1 - if nextOffset < curr.NumTransitions() { + numTrans := curr.NumTransitions() + + INNER: + for nextOffset < numTrans { t := curr.TransitionAt(nextOffset) autNext := i.aut.Accept(autCurr, t) - if i.aut.CanMatch(autNext) { - pos, nextAddr, v := curr.TransitionFor(t) - - // the next slot in the statesStack might have an - // fstState instance that we can reuse - var nextPrealloc fstState - if len(i.statesStack) < cap(i.statesStack) { - nextPrealloc = i.statesStack[0:cap(i.statesStack)][len(i.statesStack)] - } + if !i.aut.CanMatch(autNext) { + nextOffset += 1 + continue INNER + } - // push onto stack - next, err := i.f.decoder.stateAt(nextAddr, nextPrealloc) - if err != nil { - return err - } - i.statesStack = append(i.statesStack, next) - i.keysStack = append(i.keysStack, t) - i.keysPosStack = append(i.keysPosStack, pos) - i.valsStack = append(i.valsStack, v) - i.autStatesStack = append(i.autStatesStack, autNext) - lastOffset = -1 - - // check to see if new 
keystack might have gone too far - if i.endKeyExclusive != nil && bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 { - return ErrIteratorDone - } - } else { - lastOffset = nextOffset + pos, nextAddr, v := curr.TransitionFor(t) + + // the next slot in the statesStack might have an + // fstState instance that we can reuse + var nextPrealloc fstState + if len(i.statesStack) < cap(i.statesStack) { + nextPrealloc = i.statesStack[0:cap(i.statesStack)][len(i.statesStack)] } - continue + // push onto stack + next, err := i.f.decoder.stateAt(nextAddr, nextPrealloc) + if err != nil { + return err + } + + i.statesStack = append(i.statesStack, next) + i.keysStack = append(i.keysStack, t) + i.keysPosStack = append(i.keysPosStack, pos) + i.valsStack = append(i.valsStack, v) + i.autStatesStack = append(i.autStatesStack, autNext) + + // check to see if new keystack might have gone too far + if i.endKeyExclusive != nil && + bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 { + return ErrIteratorDone + } + + nextOffset = 0 + continue OUTER } - if len(i.statesStack) > 1 { - // no transitions, and still room to pop - i.statesStack = i.statesStack[:len(i.statesStack)-1] - i.keysStack = i.keysStack[:len(i.keysStack)-1] - lastOffset = i.keysPosStack[len(i.keysPosStack)-1] - - i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-1] - i.valsStack = i.valsStack[:len(i.valsStack)-1] - i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-1] - continue - } else { + if len(i.statesStack) <= 1 { // stack len is 1 (root), can't go back further, we're done break } + // no transitions, and still room to pop + i.statesStack = i.statesStack[:len(i.statesStack)-1] + i.keysStack = i.keysStack[:len(i.keysStack)-1] + + nextOffset = i.keysPosStack[len(i.keysPosStack)-1] + 1 + + i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-1] + i.valsStack = i.valsStack[:len(i.valsStack)-1] + i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-1] } return ErrIteratorDone @@ -262,15 +268,12 @@ func 
(i *FSTIterator) next(lastOffset int) error { // seek operation would go past the last key, or outside the configured // startKeyInclusive/endKeyExclusive then ErrIteratorDone is returned. func (i *FSTIterator) Seek(key []byte) error { - err := i.pointTo(key) - if err != nil { - return err - } - return nil + return i.pointTo(key) } // Close will free any resources held by this iterator. func (i *FSTIterator) Close() error { - // at the moment we don't do anything, but wanted this for API completeness + // at the moment we don't do anything, + // but wanted this for API completeness return nil } diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE new file mode 100644 index 000000000..6b0b1270f --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go new file mode 100644 index 000000000..4bf64fef2 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go @@ -0,0 +1,125 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package levenshtein2 + +import ( + "fmt" + "sort" + "unicode/utf8" +) + +type FullCharacteristicVector []uint32 + +func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { + bucketID := offset / 32 + align := offset - bucketID*32 + if align == 0 { + return fcv[bucketID] & mask + } + left := fcv[bucketID] >> align + right := fcv[bucketID+1] << (32 - align) + return (left | right) & mask +} + +type tuple struct { + char rune + fcv FullCharacteristicVector +} + +type sortRunes []rune + +func (s sortRunes) Less(i, j int) bool { + return s[i] < s[j] +} + +func (s sortRunes) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortRunes) Len() int { + return len(s) +} + +func sortRune(r []rune) []rune { + sort.Sort(sortRunes(r)) + return r +} + +type Alphabet struct { + charset []tuple + index uint32 +} + +func (a *Alphabet) resetNext() { + a.index = 0 +} + +func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { + if int(a.index) >= len(a.charset) { + return 0, nil, fmt.Errorf("eof") + } + + rv := a.charset[a.index] + a.index++ + return rv.char, rv.fcv, nil +} + +func dedupe(in string) string { + lookUp := make(map[rune]struct{}, len(in)) + var rv string + for len(in) > 0 { + r, size := utf8.DecodeRuneInString(in) + in = in[size:] + if _, ok := lookUp[r]; !ok { + rv += string(r) + lookUp[r] = struct{}{} + } + } + return rv +} + +func queryChars(qChars string) Alphabet { + chars := dedupe(qChars) + inChars := sortRune([]rune(chars)) + charsets := make([]tuple, 0, len(inChars)) + + for _, c := range inChars { + tempChars := qChars + var bits []uint32 + for len(tempChars) > 0 { + var chunk string + if len(tempChars) > 32 { + chunk = tempChars[0:32] + tempChars = tempChars[32:] + } else { + chunk = tempChars + tempChars = tempChars[:0] + } + + chunkBits := uint32(0) + bit := uint32(1) + for _, chr := range chunk { + if chr == c { + chunkBits |= bit + } + bit <<= 1 + } + bits = append(bits, chunkBits) + } + bits = append(bits, 0) + 
charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) + } + return Alphabet{charset: charsets} +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go new file mode 100644 index 000000000..e82a780a5 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go @@ -0,0 +1,250 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "fmt" + "math" +) + +const SinkState = uint32(0) + +type DFA struct { + transitions [][256]uint32 + distances []Distance + initState int + ed uint8 +} + +/// Returns the initial state +func (d *DFA) initialState() int { + return d.initState +} + +/// Returns the Levenshtein distance associated to the +/// current state. +func (d *DFA) distance(stateId int) Distance { + return d.distances[stateId] +} + +/// Returns the number of states in the `DFA`. +func (d *DFA) numStates() int { + return len(d.transitions) +} + +/// Returns the destination state reached after consuming a given byte. 
+func (d *DFA) transition(fromState int, b uint8) int { + return int(d.transitions[fromState][b]) +} + +func (d *DFA) eval(bytes []uint8) Distance { + state := d.initialState() + + for _, b := range bytes { + state = d.transition(state, b) + } + + return d.distance(state) +} + +func (d *DFA) Start() int { + return int(d.initialState()) +} + +func (d *DFA) IsMatch(state int) bool { + if _, ok := d.distance(state).(Exact); ok { + return true + } + return false +} + +func (d *DFA) CanMatch(state int) bool { + return state > 0 && state < d.numStates() +} + +func (d *DFA) Accept(state int, b byte) int { + return int(d.transition(state, b)) +} + +// WillAlwaysMatch returns if the specified state will always end in a +// matching state. +func (d *DFA) WillAlwaysMatch(state int) bool { + return false +} + +func fill(dest []uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +func fillTransitions(dest *[256]uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +type Utf8DFAStateBuilder struct { + dfaBuilder *Utf8DFABuilder + stateID uint32 + defaultSuccessor []uint32 +} + +func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, + toStateID uint32) { + sb.dfaBuilder.transitions[fromStateID][b] = toStateID +} + +func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) { + fromStateID := sb.stateID + chars := []byte(string(in)) + lastByte := chars[len(chars)-1] + + for i, ch := range chars[:len(chars)-1] { + remNumBytes := len(chars) - i - 1 + defaultSuccessor := sb.defaultSuccessor[remNumBytes] + intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch] + + if intermediateStateID == defaultSuccessor { + intermediateStateID = sb.dfaBuilder.allocate() + fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID], + sb.defaultSuccessor[remNumBytes-1]) + } + + sb.addTransitionID(fromStateID, ch, intermediateStateID) + fromStateID = intermediateStateID + } + + toStateIDDecoded := 
sb.dfaBuilder.getOrAllocate(original(toStateID)) + sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded) +} + +type Utf8StateId uint32 + +func original(stateId uint32) Utf8StateId { + return predecessor(stateId, 0) +} + +func predecessor(stateId uint32, numSteps uint8) Utf8StateId { + return Utf8StateId(stateId*4 + uint32(numSteps)) +} + +// Utf8DFABuilder makes it possible to define a DFA +// that takes unicode character, and build a `DFA` +// that operates on utf-8 encoded +type Utf8DFABuilder struct { + index []uint32 + distances []Distance + transitions [][256]uint32 + initialState uint32 + numStates uint32 + maxNumStates uint32 +} + +func withMaxStates(maxStates uint32) *Utf8DFABuilder { + rv := &Utf8DFABuilder{ + index: make([]uint32, maxStates*2+100), + distances: make([]Distance, 0, maxStates), + transitions: make([][256]uint32, 0, maxStates), + maxNumStates: maxStates, + } + + for i := range rv.index { + rv.index[i] = math.MaxUint32 + } + + return rv +} + +func (dfab *Utf8DFABuilder) allocate() uint32 { + newState := dfab.numStates + dfab.numStates++ + + dfab.distances = append(dfab.distances, Atleast{d: 255}) + dfab.transitions = append(dfab.transitions, [256]uint32{}) + + return newState +} + +func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 { + if int(state) >= cap(dfab.index) { + cloneIndex := make([]uint32, int(state)*2) + copy(cloneIndex, dfab.index) + dfab.index = cloneIndex + } + if dfab.index[state] != math.MaxUint32 { + return dfab.index[state] + } + + nstate := dfab.allocate() + dfab.index[state] = nstate + + return nstate +} + +func (dfab *Utf8DFABuilder) setInitialState(iState uint32) { + decodedID := dfab.getOrAllocate(original(iState)) + dfab.initialState = decodedID +} + +func (dfab *Utf8DFABuilder) build(ed uint8) *DFA { + return &DFA{ + transitions: dfab.transitions, + distances: dfab.distances, + initState: int(dfab.initialState), + ed: ed, + } +} + +func (dfab *Utf8DFABuilder) addState(state, default_suc_orig 
uint32, + distance Distance) (*Utf8DFAStateBuilder, error) { + if state > dfab.maxNumStates { + return nil, fmt.Errorf("State id is larger than maxNumStates") + } + + stateID := dfab.getOrAllocate(original(state)) + dfab.distances[stateID] = distance + + defaultSuccID := dfab.getOrAllocate(original(default_suc_orig)) + // creates a chain of states of predecessors of `default_suc_orig`. + // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]` + // leads to the `default_suc_orig` state. + predecessorStates := []uint32{defaultSuccID, + defaultSuccID, + defaultSuccID, + defaultSuccID} + + for numBytes := uint8(1); numBytes < 4; numBytes++ { + predecessorState := predecessor(default_suc_orig, numBytes) + predecessorStateID := dfab.getOrAllocate(predecessorState) + predecessorStates[numBytes] = predecessorStateID + succ := predecessorStates[numBytes-1] + fillTransitions(&dfab.transitions[predecessorStateID], succ) + } + + // 1-byte encoded chars. + fill(dfab.transitions[stateID][0:192], predecessorStates[0]) + // 2-bytes encoded chars. + fill(dfab.transitions[stateID][192:224], predecessorStates[1]) + // 3-bytes encoded chars. + fill(dfab.transitions[stateID][224:240], predecessorStates[2]) + // 4-bytes encoded chars. + fill(dfab.transitions[stateID][240:256], predecessorStates[3]) + + return &Utf8DFAStateBuilder{ + dfaBuilder: dfab, + stateID: stateID, + defaultSuccessor: predecessorStates}, nil +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go new file mode 100644 index 000000000..1ca0aaa65 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go @@ -0,0 +1,64 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import "fmt" + +// StateLimit is the maximum number of states allowed +const StateLimit = 10000 + +// ErrTooManyStates is returned if you attempt to build a Levenshtein +// automaton which requires too many states. +var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", + StateLimit) + +// LevenshteinAutomatonBuilder wraps a precomputed +// datastructure that allows to produce small (but not minimal) DFA. +type LevenshteinAutomatonBuilder struct { + pDfa *ParametricDFA +} + +// NewLevenshteinAutomatonBuilder creates a +// reusable, threadsafe Levenshtein automaton builder. +// `maxDistance` - maximum distance considered by the automaton. +// `transposition` - assign a distance of 1 for transposition +// +// Building this automaton builder is computationally intensive. +// While it takes only a few milliseconds for `d=2`, it grows +// exponentially with `d`. It is only reasonable to `d <= 5`. +func NewLevenshteinAutomatonBuilder(maxDistance uint8, + transposition bool) (*LevenshteinAutomatonBuilder, error) { + lnfa := newLevenshtein(maxDistance, transposition) + + pdfa, err := fromNfa(lnfa) + if err != nil { + return nil, err + } + + return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil +} + +// BuildDfa builds the levenshtein automaton for serving +// queries with a given edit distance. 
+func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string, + fuzziness uint8) (*DFA, error) { + return lab.pDfa.buildDfa(query, fuzziness, false) +} + +// MaxDistance returns the MaxEdit distance supported by the +// LevenshteinAutomatonBuilder builder. +func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 { + return lab.pDfa.maxDistance +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go new file mode 100644 index 000000000..bed9b99d5 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go @@ -0,0 +1,292 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "math" + "sort" +) + +/// Levenshtein Distance computed by a Levenshtein Automaton. +/// +/// Levenshtein automata can only compute the exact Levenshtein distance +/// up to a given `max_distance`. +/// +/// Over this distance, the automaton will invariably +/// return `Distance::AtLeast(max_distance + 1)`. 
+type Distance interface { + distance() uint8 +} + +type Exact struct { + d uint8 +} + +func (e Exact) distance() uint8 { + return e.d +} + +type Atleast struct { + d uint8 +} + +func (a Atleast) distance() uint8 { + return a.d +} + +func characteristicVector(query []rune, c rune) uint64 { + chi := uint64(0) + for i := 0; i < len(query); i++ { + if query[i] == c { + chi |= 1 << uint64(i) + } + } + return chi +} + +type NFAState struct { + Offset uint32 + Distance uint8 + InTranspose bool +} + +type NFAStates []NFAState + +func (ns NFAStates) Len() int { + return len(ns) +} + +func (ns NFAStates) Less(i, j int) bool { + if ns[i].Offset != ns[j].Offset { + return ns[i].Offset < ns[j].Offset + } + + if ns[i].Distance != ns[j].Distance { + return ns[i].Distance < ns[j].Distance + } + + return !ns[i].InTranspose && ns[j].InTranspose +} + +func (ns NFAStates) Swap(i, j int) { + ns[i], ns[j] = ns[j], ns[i] +} + +func (ns *NFAState) imply(other NFAState) bool { + transposeImply := ns.InTranspose + if !other.InTranspose { + transposeImply = !other.InTranspose + } + + deltaOffset := ns.Offset - other.Offset + if ns.Offset < other.Offset { + deltaOffset = other.Offset - ns.Offset + } + + if transposeImply { + return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset) + } + + return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset) +} + +type MultiState struct { + states []NFAState +} + +func (ms *MultiState) States() []NFAState { + return ms.states +} + +func (ms *MultiState) Clear() { + ms.states = ms.states[:0] +} + +func newMultiState() *MultiState { + return &MultiState{states: make([]NFAState, 0)} +} + +func (ms *MultiState) normalize() uint32 { + minOffset := uint32(math.MaxUint32) + + for _, s := range ms.states { + if s.Offset < minOffset { + minOffset = s.Offset + } + } + if minOffset == uint32(math.MaxUint32) { + minOffset = 0 + } + + for i := 0; i < len(ms.states); i++ { + ms.states[i].Offset -= minOffset + } + + sort.Sort(NFAStates(ms.states)) 
+ + return minOffset +} + +func (ms *MultiState) addStates(nState NFAState) { + + for _, s := range ms.states { + if s.imply(nState) { + return + } + } + + i := 0 + for i < len(ms.states) { + if nState.imply(ms.states[i]) { + ms.states = append(ms.states[:i], ms.states[i+1:]...) + } else { + i++ + } + } + ms.states = append(ms.states, nState) + +} + +func extractBit(bitset uint64, pos uint8) bool { + shift := bitset >> pos + bit := shift & 1 + return bit == uint64(1) +} + +func dist(left, right uint32) uint32 { + if left > right { + return left - right + } + return right - left +} + +type LevenshteinNFA struct { + mDistance uint8 + damerau bool +} + +func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA { + return &LevenshteinNFA{mDistance: maxD, + damerau: transposition, + } +} + +func (la *LevenshteinNFA) maxDistance() uint8 { + return la.mDistance +} + +func (la *LevenshteinNFA) msDiameter() uint8 { + return 2*la.mDistance + 1 +} + +func (la *LevenshteinNFA) initialStates() *MultiState { + ms := MultiState{} + nfaState := NFAState{} + ms.addStates(nfaState) + return &ms +} + +func (la *LevenshteinNFA) multistateDistance(ms *MultiState, + queryLen uint32) Distance { + minDistance := Atleast{d: la.mDistance + 1} + for _, s := range ms.states { + t := s.Distance + uint8(dist(queryLen, s.Offset)) + if t <= uint8(la.mDistance) { + if minDistance.distance() > t { + minDistance.d = t + } + } + } + + if minDistance.distance() == la.mDistance+1 { + return Atleast{d: la.mDistance + 1} + } + + return minDistance +} + +func (la *LevenshteinNFA) simpleTransition(state NFAState, + symbol uint64, ms *MultiState) { + + if state.Distance < la.mDistance { + // insertion + ms.addStates(NFAState{Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: false}) + + // substitution + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance + 1, + InTranspose: false}) + + n := la.mDistance + 1 - state.Distance + for d := uint8(1); d < n; d++ 
{ + if extractBit(symbol, d) { + // for d > 0, as many deletion and character match + ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d), + Distance: state.Distance + d, + InTranspose: false}) + } + } + + if la.damerau && extractBit(symbol, 1) { + ms.addStates(NFAState{ + Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: true}) + } + + } + + if extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance, + InTranspose: false}) + } + + if state.InTranspose && extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 2, + Distance: state.Distance, + InTranspose: false}) + } + +} + +func (la *LevenshteinNFA) transition(cState *MultiState, + dState *MultiState, scv uint64) { + dState.Clear() + mask := (uint64(1) << la.msDiameter()) - uint64(1) + + for _, state := range cState.states { + cv := (scv >> state.Offset) & mask + la.simpleTransition(state, cv, dState) + } + + sort.Sort(NFAStates(dState.states)) +} + +func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance { + cState := la.initialStates() + nState := newMultiState() + + for _, i := range other { + nState.Clear() + chi := characteristicVector(query, i) + la.transition(cState, nState, chi) + cState, nState = nState, cState + } + + return la.multistateDistance(cState, uint32(len(query))) +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go new file mode 100644 index 000000000..ebd931195 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go @@ -0,0 +1,349 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "crypto/md5" + "encoding/json" + "fmt" + "math" +) + +type ParametricState struct { + shapeID uint32 + offset uint32 +} + +func newParametricState() ParametricState { + return ParametricState{} +} + +func (ps *ParametricState) isDeadEnd() bool { + return ps.shapeID == 0 +} + +type Transition struct { + destShapeID uint32 + deltaOffset uint32 +} + +func (t *Transition) apply(state ParametricState) ParametricState { + ps := ParametricState{ + shapeID: t.destShapeID} + // don't need any offset if we are in the dead state, + // this ensures we have only one dead state. 
+ if t.destShapeID != 0 { + ps.offset = state.offset + t.deltaOffset + } + + return ps +} + +type ParametricStateIndex struct { + stateIndex []uint32 + stateQueue []ParametricState + numOffsets uint32 +} + +func newParametricStateIndex(queryLen, + numParamState uint32) ParametricStateIndex { + numOffsets := queryLen + 1 + if numParamState == 0 { + numParamState = numOffsets + } + maxNumStates := numParamState * numOffsets + psi := ParametricStateIndex{ + stateIndex: make([]uint32, maxNumStates), + stateQueue: make([]ParametricState, 0, 150), + numOffsets: numOffsets, + } + + for i := uint32(0); i < maxNumStates; i++ { + psi.stateIndex[i] = math.MaxUint32 + } + return psi +} + +func (psi *ParametricStateIndex) numStates() int { + return len(psi.stateQueue) +} + +func (psi *ParametricStateIndex) maxNumStates() int { + return len(psi.stateIndex) +} + +func (psi *ParametricStateIndex) get(stateID uint32) ParametricState { + return psi.stateQueue[stateID] +} + +func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 { + bucket := ps.shapeID*psi.numOffsets + ps.offset + if bucket < uint32(len(psi.stateIndex)) && + psi.stateIndex[bucket] != math.MaxUint32 { + return psi.stateIndex[bucket] + } + nState := uint32(len(psi.stateQueue)) + psi.stateQueue = append(psi.stateQueue, ps) + + psi.stateIndex[bucket] = nState + return nState +} + +type ParametricDFA struct { + distance []uint8 + transitions []Transition + maxDistance uint8 + transitionStride uint32 + diameter uint32 +} + +func (pdfa *ParametricDFA) initialState() ParametricState { + return ParametricState{shapeID: 1} +} + +// Returns true iff whatever characters come afterward, +// we will never reach a shorter distance +func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool { + if state.isDeadEnd() { + return true + } + + remOffset := queryLen - state.offset + if remOffset < pdfa.diameter { + stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:] + prefixDistance 
:= stateDistances[remOffset] + if prefixDistance > pdfa.maxDistance { + return false + } + + for _, d := range stateDistances { + if d < prefixDistance { + return false + } + } + return true + } + return false +} + +func (pdfa *ParametricDFA) numStates() int { + return len(pdfa.transitions) / int(pdfa.transitionStride) +} + +func min(x, y uint32) uint32 { + if x < y { + return x + } + return y +} + +func (pdfa *ParametricDFA) transition(state ParametricState, + chi uint32) Transition { + return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi] +} + +func (pdfa *ParametricDFA) getDistance(state ParametricState, + qLen uint32) Distance { + remainingOffset := qLen - state.offset + if state.isDeadEnd() || remainingOffset >= pdfa.diameter { + return Atleast{d: pdfa.maxDistance + 1} + } + dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)] + if dist > pdfa.maxDistance { + return Atleast{d: dist} + } + return Exact{d: dist} +} + +func (pdfa *ParametricDFA) computeDistance(left, right string) Distance { + state := pdfa.initialState() + leftChars := []rune(left) + for _, chr := range []rune(right) { + start := state.offset + stop := min(start+pdfa.diameter, uint32(len(leftChars))) + chi := characteristicVector(leftChars[start:stop], chr) + transition := pdfa.transition(state, uint32(chi)) + state = transition.apply(state) + if state.isDeadEnd() { + return Atleast{d: pdfa.maxDistance + 1} + } + } + return pdfa.getDistance(state, uint32(len(left))) +} + +func (pdfa *ParametricDFA) buildDfa(query string, distance uint8, + prefix bool) (*DFA, error) { + qLen := uint32(len([]rune(query))) + alphabet := queryChars(query) + + psi := newParametricStateIndex(qLen, uint32(pdfa.numStates())) + maxNumStates := psi.maxNumStates() + deadEndStateID := psi.getOrAllocate(newParametricState()) + if deadEndStateID != 0 { + return nil, fmt.Errorf("Invalid dead end state") + } + + initialStateID := psi.getOrAllocate(pdfa.initialState()) + dfaBuilder := 
withMaxStates(uint32(maxNumStates)) + mask := uint32((1 << pdfa.diameter) - 1) + + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == psi.numStates() { + break + } + state := psi.get(uint32(stateID)) + if prefix && pdfa.isPrefixSink(state, qLen) { + distance := pdfa.getDistance(state, qLen) + dfaBuilder.addState(uint32(stateID), uint32(stateID), distance) + } else { + transition := pdfa.transition(state, 0) + defSuccessor := transition.apply(state) + defSuccessorID := psi.getOrAllocate(defSuccessor) + distance := pdfa.getDistance(state, qLen) + stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance) + + if err != nil { + return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err) + } + + alphabet.resetNext() + chr, cv, err := alphabet.next() + for err == nil { + chi := cv.shiftAndMask(state.offset, mask) + + transition := pdfa.transition(state, chi) + + destState := transition.apply(state) + + destStateID := psi.getOrAllocate(destState) + + stateBuilder.addTransition(chr, destStateID) + + chr, cv, err = alphabet.next() + } + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + dfaBuilder.setInitialState(initialStateID) + return dfaBuilder.build(distance), nil +} + +func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) { + lookUp := newHash() + lookUp.getOrAllocate(*newMultiState()) + initialState := nfa.initialStates() + lookUp.getOrAllocate(*initialState) + + maxDistance := nfa.maxDistance() + msDiameter := nfa.msDiameter() + + numChi := 1 << msDiameter + chiValues := make([]uint64, numChi) + for i := 0; i < numChi; i++ { + chiValues[i] = uint64(i) + } + + transitions := make([]Transition, 0, numChi*int(msDiameter)) + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == len(lookUp.items) { + break + } + + for _, chi := range chiValues { + destMs := newMultiState() + + ms := lookUp.getFromID(stateID) + + nfa.transition(ms, destMs, chi) + + 
translation := destMs.normalize() + + destID := lookUp.getOrAllocate(*destMs) + + transitions = append(transitions, Transition{ + destShapeID: uint32(destID), + deltaOffset: translation, + }) + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + ns := len(lookUp.items) + diameter := int(msDiameter) + + distances := make([]uint8, 0, diameter*ns) + for stateID := 0; stateID < ns; stateID++ { + ms := lookUp.getFromID(stateID) + for offset := 0; offset < diameter; offset++ { + dist := nfa.multistateDistance(ms, uint32(offset)) + distances = append(distances, dist.distance()) + } + } + + return &ParametricDFA{ + diameter: uint32(msDiameter), + transitions: transitions, + maxDistance: maxDistance, + transitionStride: uint32(numChi), + distance: distances, + }, nil +} + +type hash struct { + index map[[16]byte]int + items []MultiState +} + +func newHash() *hash { + return &hash{ + index: make(map[[16]byte]int, 100), + items: make([]MultiState, 0, 100), + } +} + +func (h *hash) getOrAllocate(m MultiState) int { + size := len(h.items) + var exists bool + var pos int + md5 := getHash(&m) + if pos, exists = h.index[md5]; !exists { + h.index[md5] = size + pos = size + h.items = append(h.items, m) + } + return pos +} + +func (h *hash) getFromID(id int) *MultiState { + return &h.items[id] +} + +func getHash(ms *MultiState) [16]byte { + msBytes := []byte{} + for _, state := range ms.states { + jsonBytes, _ := json.Marshal(&state) + msBytes = append(msBytes, jsonBytes...) 
+ } + return md5.Sum(msBytes) +} diff --git a/vendor/github.com/couchbase/vellum/regexp/compile.go b/vendor/github.com/couchbase/vellum/regexp/compile.go index 6922b749d..55280164c 100644 --- a/vendor/github.com/couchbase/vellum/regexp/compile.go +++ b/vendor/github.com/couchbase/vellum/regexp/compile.go @@ -18,17 +18,27 @@ import ( "regexp/syntax" "unicode" + unicode_utf8 "unicode/utf8" + "github.com/couchbase/vellum/utf8" ) type compiler struct { sizeLimit uint insts prog + instsPool []inst + + sequences utf8.Sequences + rangeStack utf8.RangeStack + startBytes []byte + endBytes []byte } func newCompiler(sizeLimit uint) *compiler { return &compiler{ - sizeLimit: sizeLimit, + sizeLimit: sizeLimit, + startBytes: make([]byte, unicode_utf8.UTFMax), + endBytes: make([]byte, unicode_utf8.UTFMax), } } @@ -37,13 +47,13 @@ func (c *compiler) compile(ast *syntax.Regexp) (prog, error) { if err != nil { return nil, err } - c.insts = append(c.insts, &inst{ - op: OpMatch, - }) + inst := c.allocInst() + inst.op = OpMatch + c.insts = append(c.insts, inst) return c.insts, nil } -func (c *compiler) c(ast *syntax.Regexp) error { +func (c *compiler) c(ast *syntax.Regexp) (err error) { if ast.Flags&syntax.NonGreedy > 1 { return ErrNoLazy } @@ -67,11 +77,12 @@ func (c *compiler) c(ast *syntax.Regexp) error { next.Rune = next.Rune0[0:2] return c.c(&next) } - seqs, err := utf8.NewSequences(r, r) + c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc( + r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes) if err != nil { return err } - for _, seq := range seqs { + for _, seq := range c.sequences { c.compileUtf8Ranges(seq) } } @@ -106,8 +117,7 @@ func (c *compiler) c(ast *syntax.Regexp) error { if len(ast.Sub) == 0 { return nil } - jmpsToEnd := []uint{} - + jmpsToEnd := make([]uint, 0, len(ast.Sub)-1) // does not handle last entry for i := 0; i < len(ast.Sub)-1; i++ { sub := ast.Sub[i] @@ -188,7 +198,8 @@ func (c *compiler) c(ast *syntax.Regexp) error { return err } } - var 
splits, starts []uint + splits := make([]uint, 0, ast.Max-ast.Min) + starts := make([]uint, 0, ast.Max-ast.Min) for i := ast.Min; i < ast.Max; i++ { splits = append(splits, c.emptySplit()) starts = append(starts, uint(len(c.insts))) @@ -218,8 +229,7 @@ func (c *compiler) compileClass(ast *syntax.Regexp) error { if len(ast.Rune) == 0 { return nil } - var jmps []uint - + jmps := make([]uint, 0, len(ast.Rune)-2) // does not do last pair for i := 0; i < len(ast.Rune)-2; i += 2 { rstart := ast.Rune[i] @@ -249,16 +259,16 @@ func (c *compiler) compileClass(ast *syntax.Regexp) error { return nil } -func (c *compiler) compileClassRange(startR, endR rune) error { - seqs, err := utf8.NewSequences(startR, endR) +func (c *compiler) compileClassRange(startR, endR rune) (err error) { + c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc( + startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes) if err != nil { return err } - var jmps []uint - + jmps := make([]uint, 0, len(c.sequences)-1) // does not do last entry - for i := 0; i < len(seqs)-1; i++ { - seq := seqs[i] + for i := 0; i < len(c.sequences)-1; i++ { + seq := c.sequences[i] split := c.emptySplit() j1 := c.top() c.compileUtf8Ranges(seq) @@ -267,7 +277,7 @@ func (c *compiler) compileClassRange(startR, endR rune) error { c.setSplit(split, j1, j2) } // handle last entry - c.compileUtf8Ranges(seqs[len(seqs)-1]) + c.compileUtf8Ranges(c.sequences[len(c.sequences)-1]) end := c.top() for _, jmp := range jmps { c.setJump(jmp, end) @@ -278,25 +288,25 @@ func (c *compiler) compileClassRange(startR, endR rune) error { func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) { for _, r := range seq { - c.insts = append(c.insts, &inst{ - op: OpRange, - rangeStart: r.Start, - rangeEnd: r.End, - }) + inst := c.allocInst() + inst.op = OpRange + inst.rangeStart = r.Start + inst.rangeEnd = r.End + c.insts = append(c.insts, inst) } } func (c *compiler) emptySplit() uint { - c.insts = append(c.insts, &inst{ - op: OpSplit, - 
}) + inst := c.allocInst() + inst.op = OpSplit + c.insts = append(c.insts, inst) return c.top() - 1 } func (c *compiler) emptyJump() uint { - c.insts = append(c.insts, &inst{ - op: OpJmp, - }) + inst := c.allocInst() + inst.op = OpJmp + c.insts = append(c.insts, inst) return c.top() - 1 } @@ -314,3 +324,12 @@ func (c *compiler) setJump(i, pc uint) { func (c *compiler) top() uint { return uint(len(c.insts)) } + +func (c *compiler) allocInst() *inst { + if len(c.instsPool) <= 0 { + c.instsPool = make([]inst, 16) + } + inst := &c.instsPool[0] + c.instsPool = c.instsPool[1:] + return inst +} diff --git a/vendor/github.com/couchbase/vellum/regexp/dfa.go b/vendor/github.com/couchbase/vellum/regexp/dfa.go index 9864606b6..7e6fb29da 100644 --- a/vendor/github.com/couchbase/vellum/regexp/dfa.go +++ b/vendor/github.com/couchbase/vellum/regexp/dfa.go @@ -23,7 +23,7 @@ import ( const StateLimit = 10000 // ErrTooManyStates is returned if you attempt to build a Levenshtein -// automaton which requries too many states. +// automaton which requires too many states. 
var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", StateLimit) @@ -37,12 +37,12 @@ func newDfaBuilder(insts prog) *dfaBuilder { d := &dfaBuilder{ dfa: &dfa{ insts: insts, - states: make([]*state, 0, 16), + states: make([]state, 0, 16), }, cache: make(map[string]int, 1024), } // add 0 state that is invalid - d.dfa.states = append(d.dfa.states, &state{ + d.dfa.states = append(d.dfa.states, state{ next: make([]int, 256), match: false, }) @@ -54,13 +54,15 @@ func (d *dfaBuilder) build() (*dfa, error) { next := newSparseSet(uint(len(d.dfa.insts))) d.dfa.add(cur, 0) - states := intStack{d.cachedState(cur)} + ns, instsReuse := d.cachedState(cur, nil) + states := intStack{ns} seen := make(map[int]struct{}) var s int states, s = states.Pop() for s != 0 { for b := 0; b < 256; b++ { - ns := d.runState(cur, next, s, byte(b)) + var ns int + ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse) if ns != 0 { if _, ok := seen[ns]; !ok { seen[ns] = struct{}{} @@ -76,15 +78,17 @@ func (d *dfaBuilder) build() (*dfa, error) { return d.dfa, nil } -func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int { +func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) ( + int, []uint) { cur.Clear() for _, ip := range d.dfa.states[state].insts { cur.Add(ip) } d.dfa.run(cur, next, b) - nextState := d.cachedState(next) + var nextState int + nextState, instsReuse = d.cachedState(next, instsReuse) d.dfa.states[state].next[b] = nextState - return nextState + return nextState, instsReuse } func instsKey(insts []uint, buf []byte) []byte { @@ -99,8 +103,12 @@ func instsKey(insts []uint, buf []byte) []byte { return buf } -func (d *dfaBuilder) cachedState(set *sparseSet) int { - var insts []uint +func (d *dfaBuilder) cachedState(set *sparseSet, + instsReuse []uint) (int, []uint) { + insts := instsReuse[:0] + if cap(insts) == 0 { + insts = make([]uint, 0, set.Len()) + } var isMatch bool for i := uint(0); i < uint(set.Len()); 
i++ { ip := set.Get(i) @@ -113,26 +121,26 @@ func (d *dfaBuilder) cachedState(set *sparseSet) int { } } if len(insts) == 0 { - return 0 + return 0, insts } d.keyBuf = instsKey(insts, d.keyBuf) v, ok := d.cache[string(d.keyBuf)] if ok { - return v + return v, insts } - d.dfa.states = append(d.dfa.states, &state{ + d.dfa.states = append(d.dfa.states, state{ insts: insts, next: make([]int, 256), match: isMatch, }) newV := len(d.dfa.states) - 1 d.cache[string(d.keyBuf)] = newV - return newV + return newV, nil } type dfa struct { insts prog - states []*state + states []state } func (d *dfa) add(set *sparseSet, ip uint) { diff --git a/vendor/github.com/couchbase/vellum/regexp/inst.go b/vendor/github.com/couchbase/vellum/regexp/inst.go index 61cbf2f33..36f2e602d 100644 --- a/vendor/github.com/couchbase/vellum/regexp/inst.go +++ b/vendor/github.com/couchbase/vellum/regexp/inst.go @@ -27,7 +27,7 @@ const ( OpRange ) -// instSize is the approxmiate size of the an inst struct in bytes +// instSize is the approximate size of the an inst struct in bytes const instSize = 40 type inst struct { diff --git a/vendor/github.com/couchbase/vellum/regexp/regexp.go b/vendor/github.com/couchbase/vellum/regexp/regexp.go index ed0e7823e..920ddc370 100644 --- a/vendor/github.com/couchbase/vellum/regexp/regexp.go +++ b/vendor/github.com/couchbase/vellum/regexp/regexp.go @@ -35,6 +35,8 @@ var ErrNoLazy = fmt.Errorf("lazy quantifiers are not allowed") // too many instructions var ErrCompiledTooBig = fmt.Errorf("too many instructions") +var DefaultLimit = uint(10 * (1 << 20)) + // Regexp implements the vellum.Automaton interface for matcing a user // specified regular expression. type Regexp struct { @@ -47,7 +49,7 @@ type Regexp struct { // compiled finite state automaton. If this size is exceeded, // ErrCompiledTooBig will be returned. 
func New(expr string) (*Regexp, error) { - return NewWithLimit(expr, 10*(1<<20)) + return NewWithLimit(expr, DefaultLimit) } // NewRegexpWithLimit creates a new Regular Expression automaton with @@ -59,6 +61,10 @@ func NewWithLimit(expr string, size uint) (*Regexp, error) { if err != nil { return nil, err } + return NewParsedWithLimit(expr, parsed, size) +} + +func NewParsedWithLimit(expr string, parsed *syntax.Regexp, size uint) (*Regexp, error) { compiler := newCompiler(size) insts, err := compiler.compile(parsed) if err != nil { @@ -103,7 +109,7 @@ func (r *Regexp) WillAlwaysMatch(int) bool { return false } -// Accept returns the new state, resulting from the transite byte b +// Accept returns the new state, resulting from the transition byte b // when currently in the state s. func (r *Regexp) Accept(s int, b byte) int { if s < len(r.dfa.states) { diff --git a/vendor/github.com/couchbase/vellum/registry.go b/vendor/github.com/couchbase/vellum/registry.go index 3721a7c9c..f5b9b4d59 100644 --- a/vendor/github.com/couchbase/vellum/registry.go +++ b/vendor/github.com/couchbase/vellum/registry.go @@ -14,39 +14,35 @@ package vellum -import ( - "hash" - "hash/fnv" -) - type registryCell struct { addr int node *builderNode } type registry struct { - table []registryCell - tableSize uint - mruSize uint - hasher hash.Hash64 + builderNodePool *builderNodePool + table []registryCell + tableSize uint + mruSize uint } -func newRegistry(tableSize, mruSize int) *registry { +func newRegistry(p *builderNodePool, tableSize, mruSize int) *registry { nsize := tableSize * mruSize rv := ®istry{ - table: make([]registryCell, nsize), - tableSize: uint(tableSize), - mruSize: uint(mruSize), - hasher: fnv.New64a(), + builderNodePool: p, + table: make([]registryCell, nsize), + tableSize: uint(tableSize), + mruSize: uint(mruSize), } return rv } func (r *registry) Reset() { - for i := 0; i < len(r.table); i++ { - r.table[i] = registryCell{} + var empty registryCell + for i := range r.table { 
+ r.builderNodePool.Put(r.table[i].node) + r.table[i] = empty } - r.hasher.Reset() } func (r *registry) entry(node *builderNode) (bool, int, *registryCell) { @@ -57,7 +53,7 @@ func (r *registry) entry(node *builderNode) (bool, int, *registryCell) { start := r.mruSize * uint(bucket) end := start + r.mruSize rc := registryCache(r.table[start:end]) - return rc.entry(node) + return rc.entry(node, r.builderNodePool) } const fnvPrime = 1099511628211 @@ -81,11 +77,12 @@ func (r *registry) hash(b *builderNode) int { type registryCache []registryCell -func (r registryCache) entry(node *builderNode) (bool, int, *registryCell) { +func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, int, *registryCell) { if len(r) == 1 { if r[0].node != nil && r[0].node.equiv(node) { return true, r[0].addr, nil } + pool.Put(r[0].node) r[0].node = node return false, 0, &r[0] } @@ -98,6 +95,7 @@ func (r registryCache) entry(node *builderNode) (bool, int, *registryCell) { } // no match last := len(r) - 1 + pool.Put(r[last].node) r[last].node = node // discard LRU r.promote(last) return false, 0, &r[0] diff --git a/vendor/github.com/couchbase/vellum/utf8/utf8.go b/vendor/github.com/couchbase/vellum/utf8/utf8.go index 47dbe9d1c..54e23b937 100644 --- a/vendor/github.com/couchbase/vellum/utf8/utf8.go +++ b/vendor/github.com/couchbase/vellum/utf8/utf8.go @@ -25,19 +25,39 @@ type Sequences []Sequence // NewSequences constructs a collection of Sequence which describe the // byte ranges covered between the start and end runes. 
func NewSequences(start, end rune) (Sequences, error) { - var rv Sequences + rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil) + return rv, err +} + +func NewSequencesPrealloc(start, end rune, + preallocSequences Sequences, + preallocRangeStack RangeStack, + preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) { + rv := preallocSequences[:0] + + startBytes := preallocStartBytes + if cap(startBytes) < utf8.UTFMax { + startBytes = make([]byte, utf8.UTFMax) + } + startBytes = startBytes[:utf8.UTFMax] - var rangeStack rangeStack - rangeStack = rangeStack.Push(&scalarRange{start, end}) + endBytes := preallocEndBytes + if cap(endBytes) < utf8.UTFMax { + endBytes = make([]byte, utf8.UTFMax) + } + endBytes = endBytes[:utf8.UTFMax] + + rangeStack := preallocRangeStack[:0] + rangeStack = rangeStack.Push(scalarRange{start, end}) rangeStack, r := rangeStack.Pop() TOP: - for r != nil { + for r != nilScalarRange { INNER: for { r1, r2 := r.split() - if r1 != nil { - rangeStack = rangeStack.Push(&scalarRange{r2.start, r2.end}) + if r1 != nilScalarRange { + rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end}) r.start = r1.start r.end = r1.end continue INNER @@ -49,13 +69,13 @@ TOP: for i := 1; i < utf8.UTFMax; i++ { max := maxScalarValue(i) if r.start <= max && max < r.end { - rangeStack = rangeStack.Push(&scalarRange{max + 1, r.end}) + rangeStack = rangeStack.Push(scalarRange{max + 1, r.end}) r.end = max continue INNER } } asciiRange := r.ascii() - if asciiRange != nil { + if asciiRange != nilRange { rv = append(rv, Sequence{ asciiRange, }) @@ -66,23 +86,21 @@ TOP: m := rune((1 << (6 * i)) - 1) if (r.start & ^m) != (r.end & ^m) { if (r.start & m) != 0 { - rangeStack = rangeStack.Push(&scalarRange{(r.start | m) + 1, r.end}) + rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end}) r.end = r.start | m continue INNER } if (r.end & m) != m { - rangeStack = rangeStack.Push(&scalarRange{r.end & ^m, r.end}) + rangeStack = 
rangeStack.Push(scalarRange{r.end & ^m, r.end}) r.end = (r.end & ^m) - 1 continue INNER } } } - start := make([]byte, utf8.UTFMax) - end := make([]byte, utf8.UTFMax) - n, m := r.encode(start, end) - seq, err := SequenceFromEncodedRange(start[0:n], end[0:m]) + n, m := r.encode(startBytes, endBytes) + seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m]) if err != nil { - return nil, err + return nil, nil, err } rv = append(rv, seq) rangeStack, r = rangeStack.Pop() @@ -90,11 +108,11 @@ TOP: } } - return rv, nil + return rv, rangeStack, nil } -// Sequence is a collection of *Range -type Sequence []*Range +// Sequence is a collection of Range +type Sequence []Range // SequenceFromEncodedRange creates sequence from the encoded bytes func SequenceFromEncodedRange(start, end []byte) (Sequence, error) { @@ -104,21 +122,21 @@ func SequenceFromEncodedRange(start, end []byte) (Sequence, error) { switch len(start) { case 2: return Sequence{ - &Range{start[0], end[0]}, - &Range{start[1], end[1]}, + Range{start[0], end[0]}, + Range{start[1], end[1]}, }, nil case 3: return Sequence{ - &Range{start[0], end[0]}, - &Range{start[1], end[1]}, - &Range{start[2], end[2]}, + Range{start[0], end[0]}, + Range{start[1], end[1]}, + Range{start[2], end[2]}, }, nil case 4: return Sequence{ - &Range{start[0], end[0]}, - &Range{start[1], end[1]}, - &Range{start[2], end[2]}, - &Range{start[3], end[3]}, + Range{start[0], end[0]}, + Range{start[1], end[1]}, + Range{start[2], end[2]}, + Range{start[3], end[3]}, }, nil } @@ -159,6 +177,8 @@ type Range struct { End byte } +var nilRange = Range{0xff, 0} + func (u Range) matches(b byte) bool { if u.Start <= b && b <= u.End { return true @@ -178,37 +198,39 @@ type scalarRange struct { end rune } +var nilScalarRange = scalarRange{0xffff, 0} + func (s *scalarRange) String() string { return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end) } // split this scalar range if it overlaps with a surrogate codepoint -func (s *scalarRange) split() 
(*scalarRange, *scalarRange) { +func (s *scalarRange) split() (scalarRange, scalarRange) { if s.start < 0xe000 && s.end > 0xd7ff { - return &scalarRange{ + return scalarRange{ start: s.start, end: 0xd7ff, }, - &scalarRange{ + scalarRange{ start: 0xe000, end: s.end, } } - return nil, nil + return nilScalarRange, nilScalarRange } func (s *scalarRange) valid() bool { return s.start <= s.end } -func (s *scalarRange) ascii() *Range { +func (s *scalarRange) ascii() Range { if s.valid() && s.end <= 0x7f { - return &Range{ + return Range{ Start: byte(s.start), End: byte(s.end), } } - return nil + return nilRange } // start and end MUST have capacity for utf8.UTFMax bytes @@ -218,16 +240,16 @@ func (s *scalarRange) encode(start, end []byte) (int, int) { return n, m } -type rangeStack []*scalarRange +type RangeStack []scalarRange -func (s rangeStack) Push(v *scalarRange) rangeStack { +func (s RangeStack) Push(v scalarRange) RangeStack { return append(s, v) } -func (s rangeStack) Pop() (rangeStack, *scalarRange) { +func (s RangeStack) Pop() (RangeStack, scalarRange) { l := len(s) if l < 1 { - return s, nil + return s, nilScalarRange } return s[:l-1], s[l-1] } diff --git a/vendor/github.com/boltdb/bolt/LICENSE b/vendor/github.com/etcd-io/bbolt/LICENSE similarity index 100% rename from vendor/github.com/boltdb/bolt/LICENSE rename to vendor/github.com/etcd-io/bbolt/LICENSE diff --git a/vendor/github.com/boltdb/bolt/bolt_386.go b/vendor/github.com/etcd-io/bbolt/bolt_386.go similarity index 95% rename from vendor/github.com/boltdb/bolt/bolt_386.go rename to vendor/github.com/etcd-io/bbolt/bolt_386.go index 820d533c1..4d35ee7cf 100644 --- a/vendor/github.com/boltdb/bolt/bolt_386.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_386.go @@ -1,4 +1,4 @@ -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. 
const maxMapSize = 0x7FFFFFFF // 2GB diff --git a/vendor/github.com/boltdb/bolt/bolt_amd64.go b/vendor/github.com/etcd-io/bbolt/bolt_amd64.go similarity index 95% rename from vendor/github.com/boltdb/bolt/bolt_amd64.go rename to vendor/github.com/etcd-io/bbolt/bolt_amd64.go index 98fafdb47..60a52dad5 100644 --- a/vendor/github.com/boltdb/bolt/bolt_amd64.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_amd64.go @@ -1,4 +1,4 @@ -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. const maxMapSize = 0xFFFFFFFFFFFF // 256TB diff --git a/vendor/github.com/boltdb/bolt/bolt_arm.go b/vendor/github.com/etcd-io/bbolt/bolt_arm.go similarity index 98% rename from vendor/github.com/boltdb/bolt/bolt_arm.go rename to vendor/github.com/etcd-io/bbolt/bolt_arm.go index 7e5cb4b94..105d27ddb 100644 --- a/vendor/github.com/boltdb/bolt/bolt_arm.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_arm.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import "unsafe" diff --git a/vendor/github.com/boltdb/bolt/bolt_arm64.go b/vendor/github.com/etcd-io/bbolt/bolt_arm64.go similarity index 95% rename from vendor/github.com/boltdb/bolt/bolt_arm64.go rename to vendor/github.com/etcd-io/bbolt/bolt_arm64.go index b26d84f91..f5aa2a5ee 100644 --- a/vendor/github.com/boltdb/bolt/bolt_arm64.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_arm64.go @@ -1,6 +1,6 @@ // +build arm64 -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. 
const maxMapSize = 0xFFFFFFFFFFFF // 256TB diff --git a/vendor/github.com/boltdb/bolt/bolt_linux.go b/vendor/github.com/etcd-io/bbolt/bolt_linux.go similarity index 91% rename from vendor/github.com/boltdb/bolt/bolt_linux.go rename to vendor/github.com/etcd-io/bbolt/bolt_linux.go index 2b6766614..7707bcacf 100644 --- a/vendor/github.com/boltdb/bolt/bolt_linux.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_linux.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "syscall" diff --git a/vendor/github.com/boltdb/bolt/bolt_mips64.go b/vendor/github.com/etcd-io/bbolt/bolt_mips64x.go similarity index 58% rename from vendor/github.com/boltdb/bolt/bolt_mips64.go rename to vendor/github.com/etcd-io/bbolt/bolt_mips64x.go index 9f5060942..baeb289fd 100644 --- a/vendor/github.com/boltdb/bolt/bolt_mips64.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_mips64x.go @@ -1,11 +1,12 @@ -// +build mips64 -package bolt +// +build mips64 mips64le + +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. -const maxMapSize = 0xFFFFFFFFFFFF // 256TB +const maxMapSize = 0x8000000000 // 512GB // maxAllocSize is the size used when creating array pointers. const maxAllocSize = 0x7FFFFFFF -// brokenUnaligned Are unaligned load/stores broken on this arch? +// Are unaligned load/stores broken on this arch? var brokenUnaligned = false diff --git a/vendor/github.com/boltdb/bolt/bolt_mips.go b/vendor/github.com/etcd-io/bbolt/bolt_mipsx.go similarity index 70% rename from vendor/github.com/boltdb/bolt/bolt_mips.go rename to vendor/github.com/etcd-io/bbolt/bolt_mipsx.go index 1c06342ea..2d9b1a91f 100644 --- a/vendor/github.com/boltdb/bolt/bolt_mips.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_mipsx.go @@ -1,5 +1,6 @@ -// +build mips -package bolt +// +build mips mipsle + +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. 
const maxMapSize = 0x40000000 // 1GB @@ -7,5 +8,5 @@ const maxMapSize = 0x40000000 // 1GB // maxAllocSize is the size used when creating array pointers. const maxAllocSize = 0xFFFFFFF -// brokenUnaligned Are unaligned load/stores broken on this arch? +// Are unaligned load/stores broken on this arch? var brokenUnaligned = false diff --git a/vendor/github.com/boltdb/bolt/bolt_openbsd.go b/vendor/github.com/etcd-io/bbolt/bolt_openbsd.go similarity index 97% rename from vendor/github.com/boltdb/bolt/bolt_openbsd.go rename to vendor/github.com/etcd-io/bbolt/bolt_openbsd.go index 7058c3d73..d7f50358e 100644 --- a/vendor/github.com/boltdb/bolt/bolt_openbsd.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_openbsd.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "syscall" diff --git a/vendor/github.com/boltdb/bolt/bolt_ppc.go b/vendor/github.com/etcd-io/bbolt/bolt_ppc.go similarity index 69% rename from vendor/github.com/boltdb/bolt/bolt_ppc.go rename to vendor/github.com/etcd-io/bbolt/bolt_ppc.go index 645ddc3ed..69804714a 100644 --- a/vendor/github.com/boltdb/bolt/bolt_ppc.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_ppc.go @@ -1,9 +1,12 @@ // +build ppc -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. const maxMapSize = 0x7FFFFFFF // 2GB // maxAllocSize is the size used when creating array pointers. const maxAllocSize = 0xFFFFFFF + +// Are unaligned load/stores broken on this arch? +var brokenUnaligned = false diff --git a/vendor/github.com/boltdb/bolt/bolt_ppc64.go b/vendor/github.com/etcd-io/bbolt/bolt_ppc64.go similarity index 95% rename from vendor/github.com/boltdb/bolt/bolt_ppc64.go rename to vendor/github.com/etcd-io/bbolt/bolt_ppc64.go index 9331d9771..356590857 100644 --- a/vendor/github.com/boltdb/bolt/bolt_ppc64.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_ppc64.go @@ -1,6 +1,6 @@ // +build ppc64 -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. 
const maxMapSize = 0xFFFFFFFFFFFF // 256TB diff --git a/vendor/github.com/boltdb/bolt/bolt_ppc64le.go b/vendor/github.com/etcd-io/bbolt/bolt_ppc64le.go similarity index 95% rename from vendor/github.com/boltdb/bolt/bolt_ppc64le.go rename to vendor/github.com/etcd-io/bbolt/bolt_ppc64le.go index 8c143bc5d..422c7c69d 100644 --- a/vendor/github.com/boltdb/bolt/bolt_ppc64le.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_ppc64le.go @@ -1,6 +1,6 @@ // +build ppc64le -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. const maxMapSize = 0xFFFFFFFFFFFF // 256TB diff --git a/vendor/github.com/boltdb/bolt/bolt_s390x.go b/vendor/github.com/etcd-io/bbolt/bolt_s390x.go similarity index 95% rename from vendor/github.com/boltdb/bolt/bolt_s390x.go rename to vendor/github.com/etcd-io/bbolt/bolt_s390x.go index d7c39af92..6d3fcb825 100644 --- a/vendor/github.com/boltdb/bolt/bolt_s390x.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_s390x.go @@ -1,6 +1,6 @@ // +build s390x -package bolt +package bbolt // maxMapSize represents the largest mmap size supported by Bolt. const maxMapSize = 0xFFFFFFFFFFFF // 256TB diff --git a/vendor/github.com/boltdb/bolt/bolt_unix.go b/vendor/github.com/etcd-io/bbolt/bolt_unix.go similarity index 71% rename from vendor/github.com/boltdb/bolt/bolt_unix.go rename to vendor/github.com/etcd-io/bbolt/bolt_unix.go index cad62dda1..5f2bb5145 100644 --- a/vendor/github.com/boltdb/bolt/bolt_unix.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_unix.go @@ -1,41 +1,43 @@ // +build !windows,!plan9,!solaris -package bolt +package bbolt import ( "fmt" - "os" "syscall" "time" "unsafe" ) // flock acquires an advisory lock on a file descriptor. 
-func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error { +func flock(db *DB, exclusive bool, timeout time.Duration) error { var t time.Time + if timeout != 0 { + t = time.Now() + } + fd := db.file.Fd() + flag := syscall.LOCK_NB + if exclusive { + flag |= syscall.LOCK_EX + } else { + flag |= syscall.LOCK_SH + } for { - // If we're beyond our timeout then return an error. - // This can only occur after we've attempted a flock once. - if t.IsZero() { - t = time.Now() - } else if timeout > 0 && time.Since(t) > timeout { - return ErrTimeout - } - flag := syscall.LOCK_SH - if exclusive { - flag = syscall.LOCK_EX - } - - // Otherwise attempt to obtain an exclusive lock. - err := syscall.Flock(int(db.file.Fd()), flag|syscall.LOCK_NB) + // Attempt to obtain an exclusive lock. + err := syscall.Flock(int(fd), flag) if err == nil { return nil } else if err != syscall.EWOULDBLOCK { return err } + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return ErrTimeout + } + // Wait for a bit and try again. - time.Sleep(50 * time.Millisecond) + time.Sleep(flockRetryTimeout) } } @@ -53,7 +55,9 @@ func mmap(db *DB, sz int) error { } // Advise the kernel that the mmap is accessed randomly. - if err := madvise(b, syscall.MADV_RANDOM); err != nil { + err = madvise(b, syscall.MADV_RANDOM) + if err != nil && err != syscall.ENOSYS { + // Ignore not implemented error in kernel because it still works. 
return fmt.Errorf("madvise: %s", err) } diff --git a/vendor/github.com/boltdb/bolt/bolt_unix_solaris.go b/vendor/github.com/etcd-io/bbolt/bolt_unix_solaris.go similarity index 70% rename from vendor/github.com/boltdb/bolt/bolt_unix_solaris.go rename to vendor/github.com/etcd-io/bbolt/bolt_unix_solaris.go index 307bf2b3e..babad6578 100644 --- a/vendor/github.com/boltdb/bolt/bolt_unix_solaris.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_unix_solaris.go @@ -1,8 +1,7 @@ -package bolt +package bbolt import ( "fmt" - "os" "syscall" "time" "unsafe" @@ -11,36 +10,35 @@ import ( ) // flock acquires an advisory lock on a file descriptor. -func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error { +func flock(db *DB, exclusive bool, timeout time.Duration) error { var t time.Time + if timeout != 0 { + t = time.Now() + } + fd := db.file.Fd() + var lockType int16 + if exclusive { + lockType = syscall.F_WRLCK + } else { + lockType = syscall.F_RDLCK + } for { - // If we're beyond our timeout then return an error. - // This can only occur after we've attempted a flock once. - if t.IsZero() { - t = time.Now() - } else if timeout > 0 && time.Since(t) > timeout { - return ErrTimeout - } - var lock syscall.Flock_t - lock.Start = 0 - lock.Len = 0 - lock.Pid = 0 - lock.Whence = 0 - lock.Pid = 0 - if exclusive { - lock.Type = syscall.F_WRLCK - } else { - lock.Type = syscall.F_RDLCK - } - err := syscall.FcntlFlock(db.file.Fd(), syscall.F_SETLK, &lock) + // Attempt to obtain an exclusive lock. + lock := syscall.Flock_t{Type: lockType} + err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock) if err == nil { return nil } else if err != syscall.EAGAIN { return err } + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return ErrTimeout + } + // Wait for a bit and try again. 
- time.Sleep(50 * time.Millisecond) + time.Sleep(flockRetryTimeout) } } diff --git a/vendor/github.com/boltdb/bolt/bolt_windows.go b/vendor/github.com/etcd-io/bbolt/bolt_windows.go similarity index 76% rename from vendor/github.com/boltdb/bolt/bolt_windows.go rename to vendor/github.com/etcd-io/bbolt/bolt_windows.go index b00fb0720..fca178bd2 100644 --- a/vendor/github.com/boltdb/bolt/bolt_windows.go +++ b/vendor/github.com/etcd-io/bbolt/bolt_windows.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "fmt" @@ -16,8 +16,6 @@ var ( ) const ( - lockExt = ".lock" - // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa365203(v=vs.85).aspx flagLockExclusive = 2 flagLockFailImmediately = 1 @@ -48,48 +46,47 @@ func fdatasync(db *DB) error { } // flock acquires an advisory lock on a file descriptor. -func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error { - // Create a separate lock file on windows because a process - // cannot share an exclusive lock on the same file. This is - // needed during Tx.WriteTo(). - f, err := os.OpenFile(db.path+lockExt, os.O_CREATE, mode) - if err != nil { - return err - } - db.lockfile = f - +func flock(db *DB, exclusive bool, timeout time.Duration) error { var t time.Time + if timeout != 0 { + t = time.Now() + } + var flag uint32 = flagLockFailImmediately + if exclusive { + flag |= flagLockExclusive + } for { - // If we're beyond our timeout then return an error. - // This can only occur after we've attempted a flock once. - if t.IsZero() { - t = time.Now() - } else if timeout > 0 && time.Since(t) > timeout { - return ErrTimeout - } - - var flag uint32 = flagLockFailImmediately - if exclusive { - flag |= flagLockExclusive - } + // Fix for https://github.com/etcd-io/bbolt/issues/121. Use byte-range + // -1..0 as the lock on the database file. 
+ var m1 uint32 = (1 << 32) - 1 // -1 in a uint32 + err := lockFileEx(syscall.Handle(db.file.Fd()), flag, 0, 1, 0, &syscall.Overlapped{ + Offset: m1, + OffsetHigh: m1, + }) - err := lockFileEx(syscall.Handle(db.lockfile.Fd()), flag, 0, 1, 0, &syscall.Overlapped{}) if err == nil { return nil } else if err != errLockViolation { return err } + // If we timed oumercit then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return ErrTimeout + } + // Wait for a bit and try again. - time.Sleep(50 * time.Millisecond) + time.Sleep(flockRetryTimeout) } } // funlock releases an advisory lock on a file descriptor. func funlock(db *DB) error { - err := unlockFileEx(syscall.Handle(db.lockfile.Fd()), 0, 1, 0, &syscall.Overlapped{}) - db.lockfile.Close() - os.Remove(db.path + lockExt) + var m1 uint32 = (1 << 32) - 1 // -1 in a uint32 + err := unlockFileEx(syscall.Handle(db.file.Fd()), 0, 1, 0, &syscall.Overlapped{ + Offset: m1, + OffsetHigh: m1, + }) return err } diff --git a/vendor/github.com/boltdb/bolt/boltsync_unix.go b/vendor/github.com/etcd-io/bbolt/boltsync_unix.go similarity index 91% rename from vendor/github.com/boltdb/bolt/boltsync_unix.go rename to vendor/github.com/etcd-io/bbolt/boltsync_unix.go index f50442523..9587afefe 100644 --- a/vendor/github.com/boltdb/bolt/boltsync_unix.go +++ b/vendor/github.com/etcd-io/bbolt/boltsync_unix.go @@ -1,6 +1,6 @@ // +build !windows,!plan9,!linux,!openbsd -package bolt +package bbolt // fdatasync flushes written data to a file descriptor. 
func fdatasync(db *DB) error { diff --git a/vendor/github.com/boltdb/bolt/bucket.go b/vendor/github.com/etcd-io/bbolt/bucket.go similarity index 99% rename from vendor/github.com/boltdb/bolt/bucket.go rename to vendor/github.com/etcd-io/bbolt/bucket.go index 0c5bf2746..84bfd4d6a 100644 --- a/vendor/github.com/boltdb/bolt/bucket.go +++ b/vendor/github.com/etcd-io/bbolt/bucket.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "bytes" @@ -14,13 +14,6 @@ const ( MaxValueSize = (1 << 31) - 2 ) -const ( - maxUint = ^uint(0) - minUint = 0 - maxInt = int(^uint(0) >> 1) - minInt = -maxInt - 1 -) - const bucketHeaderSize = int(unsafe.Sizeof(bucket{})) const ( @@ -323,7 +316,12 @@ func (b *Bucket) Delete(key []byte) error { // Move cursor to correct position. c := b.Cursor() - _, _, flags := c.seek(key) + k, _, flags := c.seek(key) + + // Return nil if the key doesn't exist. + if !bytes.Equal(key, k) { + return nil + } // Return an error if there is already existing bucket value. if (flags & bucketLeafFlag) != 0 { diff --git a/vendor/github.com/boltdb/bolt/cursor.go b/vendor/github.com/etcd-io/bbolt/cursor.go similarity index 99% rename from vendor/github.com/boltdb/bolt/cursor.go rename to vendor/github.com/etcd-io/bbolt/cursor.go index 1be9f35e3..3000aced6 100644 --- a/vendor/github.com/boltdb/bolt/cursor.go +++ b/vendor/github.com/etcd-io/bbolt/cursor.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "bytes" @@ -157,12 +157,6 @@ func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) { // Start from root page/node and traverse to correct page. c.stack = c.stack[:0] c.search(seek, c.bucket.root) - ref := &c.stack[len(c.stack)-1] - - // If the cursor is pointing to the end of page/node then return nil. - if ref.index >= ref.count() { - return nil, nil, 0 - } // If this is a bucket then return a nil value. 
return c.keyValue() @@ -339,6 +333,8 @@ func (c *Cursor) nsearch(key []byte) { // keyValue returns the key and value of the current leaf element. func (c *Cursor) keyValue() ([]byte, []byte, uint32) { ref := &c.stack[len(c.stack)-1] + + // If the cursor is pointing to the end of page/node then return nil. if ref.count() == 0 || ref.index >= ref.count() { return nil, nil, 0 } diff --git a/vendor/github.com/boltdb/bolt/db.go b/vendor/github.com/etcd-io/bbolt/db.go similarity index 81% rename from vendor/github.com/boltdb/bolt/db.go rename to vendor/github.com/etcd-io/bbolt/db.go index f352ff14f..962248c99 100644 --- a/vendor/github.com/boltdb/bolt/db.go +++ b/vendor/github.com/etcd-io/bbolt/db.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "errors" @@ -7,8 +7,7 @@ import ( "log" "os" "runtime" - "runtime/debug" - "strings" + "sort" "sync" "time" "unsafe" @@ -23,6 +22,8 @@ const version = 2 // Represents a marker value to indicate that a file is a Bolt DB. const magic uint32 = 0xED0CDAED +const pgidNoFreelist pgid = 0xffffffffffffffff + // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when // syncing changes to a file. This is required as some operating systems, // such as OpenBSD, do not have a unified buffer cache (UBC) and writes @@ -39,6 +40,19 @@ const ( // default page size for db is set to the OS page size. var defaultPageSize = os.Getpagesize() +// The time elapsed between consecutive file locking attempts. +const flockRetryTimeout = 50 * time.Millisecond + +// FreelistType is the type of the freelist backend +type FreelistType string + +const ( + // FreelistArrayType indicates backend freelist type is array + FreelistArrayType = FreelistType("array") + // FreelistMapType indicates backend freelist type is hashmap + FreelistMapType = FreelistType("hashmap") +) + // DB represents a collection of buckets persisted to a file on disk. // All data access is performed through transactions which can be obtained through the DB. 
// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called. @@ -61,6 +75,18 @@ type DB struct { // THIS IS UNSAFE. PLEASE USE WITH CAUTION. NoSync bool + // When true, skips syncing freelist to disk. This improves the database + // write performance under normal operation, but requires a full database + // re-sync during recovery. + NoFreelistSync bool + + // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures + // dramatic performance degradation if database is large and framentation in freelist is common. + // The alternative one is using hashmap, it is faster in almost all circumstances + // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. + // The default type is array + FreelistType FreelistType + // When true, skips the truncate call when growing the database. // Setting this to true is only safe on non-ext3/ext4 systems. // Skipping truncation avoids preallocation of hard drive space and @@ -96,8 +122,7 @@ type DB struct { path string file *os.File - lockfile *os.File // windows only - dataref []byte // mmap'ed readonly, write throws SEGV + dataref []byte // mmap'ed readonly, write throws SEGV data *[maxMapSize]byte datasz int filesz int // current on disk file size @@ -107,9 +132,11 @@ type DB struct { opened bool rwtx *Tx txs []*Tx - freelist *freelist stats Stats + freelist *freelist + freelistLoad sync.Once + pagePool sync.Pool batchMu sync.Mutex @@ -148,14 +175,18 @@ func (db *DB) String() string { // If the file does not exist then it will be created automatically. // Passing in nil options will cause Bolt to open the database with the default options. func Open(path string, mode os.FileMode, options *Options) (*DB, error) { - var db = &DB{opened: true} - + db := &DB{ + opened: true, + } // Set default options if no options are provided. 
if options == nil { options = DefaultOptions } + db.NoSync = options.NoSync db.NoGrowSync = options.NoGrowSync db.MmapFlags = options.MmapFlags + db.NoFreelistSync = options.NoFreelistSync + db.FreelistType = options.FreelistType // Set default values for later DB operations. db.MaxBatchSize = DefaultMaxBatchSize @@ -183,7 +214,7 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) { // if !options.ReadOnly. // The database file is locked using the shared lock (more than one process may // hold a lock at the same time) otherwise (options.ReadOnly is set). - if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil { + if err := flock(db, !db.readOnly, options.Timeout); err != nil { _ = db.close() return nil, err } @@ -191,31 +222,41 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) { // Default values for test hooks db.ops.writeAt = db.file.WriteAt + if db.pageSize = options.PageSize; db.pageSize == 0 { + // Set the default page size to the OS page size. + db.pageSize = defaultPageSize + } + // Initialize the database if it doesn't exist. if info, err := db.file.Stat(); err != nil { + _ = db.close() return nil, err } else if info.Size() == 0 { // Initialize new files with meta pages. if err := db.init(); err != nil { + // clean up file descriptor on initialization fail + _ = db.close() return nil, err } } else { // Read the first meta page to determine the page size. var buf [0x1000]byte - if _, err := db.file.ReadAt(buf[:], 0); err == nil { - m := db.pageInBuffer(buf[:], 0).meta() - if err := m.validate(); err != nil { - // If we can't read the page size, we can assume it's the same - // as the OS -- since that's how the page size was chosen in the - // first place. - // - // If the first page is invalid and this OS uses a different - // page size than what the database was created with then we - // are out of luck and cannot access the database. 
- db.pageSize = os.Getpagesize() - } else { + // If we can't read the page size, but can read a page, assume + // it's the same as the OS or one given -- since that's how the + // page size was chosen in the first place. + // + // If the first page is invalid and this OS uses a different + // page size than what the database was created with then we + // are out of luck and cannot access the database. + // + // TODO: scan for next page + if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) { + if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil { db.pageSize = int(m.pageSize) } + } else { + _ = db.close() + return nil, ErrInvalid } } @@ -232,14 +273,50 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) { return nil, err } - // Read in the freelist. - db.freelist = newFreelist() - db.freelist.read(db.page(db.meta().freelist)) + if db.readOnly { + return db, nil + } + + db.loadFreelist() + + // Flush freelist when transitioning from no sync to sync so + // NoFreelistSync unaware boltdb can open the db later. + if !db.NoFreelistSync && !db.hasSyncedFreelist() { + tx, err := db.Begin(true) + if tx != nil { + err = tx.Commit() + } + if err != nil { + _ = db.close() + return nil, err + } + } // Mark the database as opened and return. return db, nil } +// loadFreelist reads the freelist if it is synced, or reconstructs it +// by scanning the DB if it is not synced. It assumes there are no +// concurrent accesses being made to the freelist. +func (db *DB) loadFreelist() { + db.freelistLoad.Do(func() { + db.freelist = newFreelist(db.FreelistType) + if !db.hasSyncedFreelist() { + // Reconstruct free list by scanning the DB. + db.freelist.readIDs(db.freepages()) + } else { + // Read free list from freelist page. 
+ db.freelist.read(db.page(db.meta().freelist)) + } + db.stats.FreePageN = db.freelist.free_count() + }) +} + +func (db *DB) hasSyncedFreelist() bool { + return db.meta().freelist != pgidNoFreelist +} + // mmap opens the underlying memory-mapped file and initializes the meta references. // minsz is the minimum size that the new mmap can be. func (db *DB) mmap(minsz int) error { @@ -341,9 +418,6 @@ func (db *DB) mmapSize(size int) (int, error) { // init creates a new database file and initializes its meta pages. func (db *DB) init() error { - // Set the page size to the OS page size. - db.pageSize = os.Getpagesize() - // Create two meta pages on a buffer. buf := make([]byte, db.pageSize*4) for i := 0; i < 2; i++ { @@ -387,7 +461,8 @@ func (db *DB) init() error { } // Close releases all database resources. -// All transactions must be closed before closing the database. +// It will block waiting for any open transactions to finish +// before closing the database and returning. func (db *DB) Close() error { db.rwlock.Lock() defer db.rwlock.Unlock() @@ -395,8 +470,8 @@ func (db *DB) Close() error { db.metalock.Lock() defer db.metalock.Unlock() - db.mmaplock.RLock() - defer db.mmaplock.RUnlock() + db.mmaplock.Lock() + defer db.mmaplock.Unlock() return db.close() } @@ -526,21 +601,36 @@ func (db *DB) beginRWTx() (*Tx, error) { t := &Tx{writable: true} t.init(db) db.rwtx = t + db.freePages() + return t, nil +} - // Free any pages associated with closed read-only transactions. - var minid txid = 0xFFFFFFFFFFFFFFFF - for _, t := range db.txs { - if t.meta.txid < minid { - minid = t.meta.txid - } +// freePages releases any pages associated with closed read-only transactions. +func (db *DB) freePages() { + // Free all pending pages prior to earliest open transaction. 
+ sort.Sort(txsById(db.txs)) + minid := txid(0xFFFFFFFFFFFFFFFF) + if len(db.txs) > 0 { + minid = db.txs[0].meta.txid } if minid > 0 { db.freelist.release(minid - 1) } - - return t, nil + // Release unused txid extents. + for _, t := range db.txs { + db.freelist.releaseRange(minid, t.meta.txid-1) + minid = t.meta.txid + 1 + } + db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF)) + // Any page both allocated and freed in an extent is safe to release. } +type txsById []*Tx + +func (t txsById) Len() int { return len(t) } +func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid } + // removeTx removes a transaction from the database. func (db *DB) removeTx(tx *Tx) { // Release the read lock on the mmap. @@ -633,11 +723,7 @@ func (db *DB) View(fn func(*Tx) error) error { return err } - if err := t.Rollback(); err != nil { - return err - } - - return nil + return t.Rollback() } // Batch calls fn as part of a batch. It behaves similar to Update, @@ -737,9 +823,7 @@ retry: // pass success, or bolt internal errors, to all callers for _, c := range b.calls { - if c.err != nil { - c.err <- err - } + c.err <- err } break retry } @@ -826,7 +910,7 @@ func (db *DB) meta() *meta { } // allocate returns a contiguous block of memory starting at a given page. -func (db *DB) allocate(count int) (*page, error) { +func (db *DB) allocate(txid txid, count int) (*page, error) { // Allocate a temporary buffer for the page. var buf []byte if count == 1 { @@ -838,7 +922,7 @@ func (db *DB) allocate(count int) (*page, error) { p.overflow = uint32(count - 1) // Use pages from the freelist if they are available. 
- if p.id = db.freelist.allocate(count); p.id != 0 { + if p.id = db.freelist.allocate(txid, count); p.id != 0 { return p, nil } @@ -893,6 +977,38 @@ func (db *DB) IsReadOnly() bool { return db.readOnly } +func (db *DB) freepages() []pgid { + tx, err := db.beginTx() + defer func() { + err = tx.Rollback() + if err != nil { + panic("freepages: failed to rollback tx") + } + }() + if err != nil { + panic("freepages: failed to open read only tx") + } + + reachable := make(map[pgid]*page) + nofreed := make(map[pgid]bool) + ech := make(chan error) + go func() { + for e := range ech { + panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e)) + } + }() + tx.checkBucket(&tx.root, reachable, nofreed, ech) + close(ech) + + var fids []pgid + for i := pgid(2); i < db.meta().pgid; i++ { + if _, ok := reachable[i]; !ok { + fids = append(fids, i) + } + } + return fids +} + // Options represents the options that can be set when opening a database. type Options struct { // Timeout is the amount of time to wait to obtain a file lock. @@ -903,6 +1019,17 @@ type Options struct { // Sets the DB.NoGrowSync flag before memory mapping the file. NoGrowSync bool + // Do not sync freelist to disk. This improves the database write performance + // under normal operation, but requires a full database re-sync during recovery. + NoFreelistSync bool + + // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures + // dramatic performance degradation if database is large and framentation in freelist is common. + // The alternative one is using hashmap, it is faster in almost all circumstances + // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. + // The default type is array + FreelistType FreelistType + // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to // grab a shared lock (UNIX). 
ReadOnly bool @@ -919,13 +1046,22 @@ type Options struct { // If initialMmapSize is smaller than the previous database size, // it takes no effect. InitialMmapSize int + + // PageSize overrides the default OS page size. + PageSize int + + // NoSync sets the initial value of DB.NoSync. Normally this can just be + // set directly on the DB itself when returned from Open(), but this option + // is useful in APIs which expose Options but not the underlying DB. + NoSync bool } // DefaultOptions represent the options used if nil options are passed into Open(). // No timeout is used which will cause Bolt to wait indefinitely for a lock. var DefaultOptions = &Options{ - Timeout: 0, - NoGrowSync: false, + Timeout: 0, + NoGrowSync: false, + FreelistType: FreelistArrayType, } // Stats represents statistics about the database. @@ -960,10 +1096,6 @@ func (s *Stats) Sub(other *Stats) Stats { return diff } -func (s *Stats) add(other *Stats) { - s.TxStats.add(&other.TxStats) -} - type Info struct { Data uintptr PageSize int @@ -1002,7 +1134,8 @@ func (m *meta) copy(dest *meta) { func (m *meta) write(p *page) { if m.root.root >= m.pgid { panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) - } else if m.freelist >= m.pgid { + } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist { + // TODO: reject pgidNoFreeList if !NoFreelistSync panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) } @@ -1029,11 +1162,3 @@ func _assert(condition bool, msg string, v ...interface{}) { panic(fmt.Sprintf("assertion failed: "+msg, v...)) } } - -func warn(v ...interface{}) { fmt.Fprintln(os.Stderr, v...) } -func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) 
} - -func printstack() { - stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n") - fmt.Fprintln(os.Stderr, stack) -} diff --git a/vendor/github.com/boltdb/bolt/doc.go b/vendor/github.com/etcd-io/bbolt/doc.go similarity index 94% rename from vendor/github.com/boltdb/bolt/doc.go rename to vendor/github.com/etcd-io/bbolt/doc.go index cc937845d..95f25f01c 100644 --- a/vendor/github.com/boltdb/bolt/doc.go +++ b/vendor/github.com/etcd-io/bbolt/doc.go @@ -1,5 +1,5 @@ /* -Package bolt implements a low-level key/value store in pure Go. It supports +Package bbolt implements a low-level key/value store in pure Go. It supports fully serializable transactions, ACID semantics, and lock-free MVCC with multiple readers and a single writer. Bolt can be used for projects that want a simple data store without the need to add large dependencies such as @@ -41,4 +41,4 @@ point to different data or can point to invalid memory which will cause a panic. */ -package bolt +package bbolt diff --git a/vendor/github.com/boltdb/bolt/errors.go b/vendor/github.com/etcd-io/bbolt/errors.go similarity index 99% rename from vendor/github.com/boltdb/bolt/errors.go rename to vendor/github.com/etcd-io/bbolt/errors.go index a3620a3eb..48758ca57 100644 --- a/vendor/github.com/boltdb/bolt/errors.go +++ b/vendor/github.com/etcd-io/bbolt/errors.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import "errors" diff --git a/vendor/github.com/etcd-io/bbolt/freelist.go b/vendor/github.com/etcd-io/bbolt/freelist.go new file mode 100644 index 000000000..93fd85d50 --- /dev/null +++ b/vendor/github.com/etcd-io/bbolt/freelist.go @@ -0,0 +1,370 @@ +package bbolt + +import ( + "fmt" + "sort" + "unsafe" +) + +// txPending holds a list of pgids and corresponding allocation txns +// that are pending to be freed. 
+type txPending struct { + ids []pgid + alloctx []txid // txids allocating the ids + lastReleaseBegin txid // beginning txid of last matching releaseRange +} + +// pidSet holds the set of starting pgids which have the same span size +type pidSet map[pgid]struct{} + +// freelist represents a list of all pages that are available for allocation. +// It also tracks pages that have been freed but are still in use by open transactions. +type freelist struct { + freelistType FreelistType // freelist type + ids []pgid // all free and available free page ids. + allocs map[pgid]txid // mapping of txid that allocated a pgid. + pending map[txid]*txPending // mapping of soon-to-be free page ids by tx. + cache map[pgid]bool // fast lookup of all free and pending page ids. + freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size + forwardMap map[pgid]uint64 // key is start pgid, value is its span size + backwardMap map[pgid]uint64 // key is end pgid, value is its span size + allocate func(txid txid, n int) pgid // the freelist allocate func + free_count func() int // the function which gives you free page number + mergeSpans func(ids pgids) // the mergeSpan func + getFreePageIDs func() []pgid // get free pgids func + readIDs func(pgids []pgid) // readIDs func reads list of pages and init the freelist +} + +// newFreelist returns an empty, initialized freelist. 
+func newFreelist(freelistType FreelistType) *freelist { + f := &freelist{ + freelistType: freelistType, + allocs: make(map[pgid]txid), + pending: make(map[txid]*txPending), + cache: make(map[pgid]bool), + freemaps: make(map[uint64]pidSet), + forwardMap: make(map[pgid]uint64), + backwardMap: make(map[pgid]uint64), + } + + if freelistType == FreelistMapType { + f.allocate = f.hashmapAllocate + f.free_count = f.hashmapFreeCount + f.mergeSpans = f.hashmapMergeSpans + f.getFreePageIDs = f.hashmapGetFreePageIDs + f.readIDs = f.hashmapReadIDs + } else { + f.allocate = f.arrayAllocate + f.free_count = f.arrayFreeCount + f.mergeSpans = f.arrayMergeSpans + f.getFreePageIDs = f.arrayGetFreePageIDs + f.readIDs = f.arrayReadIDs + } + + return f +} + +// size returns the size of the page after serialization. +func (f *freelist) size() int { + n := f.count() + if n >= 0xFFFF { + // The first element will be used to store the count. See freelist.write. + n++ + } + return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * n) +} + +// count returns count of pages on the freelist +func (f *freelist) count() int { + return f.free_count() + f.pending_count() +} + +// arrayFreeCount returns count of free pages(array version) +func (f *freelist) arrayFreeCount() int { + return len(f.ids) +} + +// pending_count returns count of pending pages +func (f *freelist) pending_count() int { + var count int + for _, txp := range f.pending { + count += len(txp.ids) + } + return count +} + +// copyall copies into dst a list of all free ids and all pending ids in one sorted list. +// f.count returns the minimum length required for dst. +func (f *freelist) copyall(dst []pgid) { + m := make(pgids, 0, f.pending_count()) + for _, txp := range f.pending { + m = append(m, txp.ids...) + } + sort.Sort(m) + mergepgids(dst, f.getFreePageIDs(), m) +} + +// arrayAllocate returns the starting page id of a contiguous list of pages of a given size. +// If a contiguous block cannot be found then 0 is returned. 
+func (f *freelist) arrayAllocate(txid txid, n int) pgid { + if len(f.ids) == 0 { + return 0 + } + + var initial, previd pgid + for i, id := range f.ids { + if id <= 1 { + panic(fmt.Sprintf("invalid page allocation: %d", id)) + } + + // Reset initial page if this is not contiguous. + if previd == 0 || id-previd != 1 { + initial = id + } + + // If we found a contiguous block then remove it and return it. + if (id-initial)+1 == pgid(n) { + // If we're allocating off the beginning then take the fast path + // and just adjust the existing slice. This will use extra memory + // temporarily but the append() in free() will realloc the slice + // as is necessary. + if (i + 1) == n { + f.ids = f.ids[i+1:] + } else { + copy(f.ids[i-n+1:], f.ids[i+1:]) + f.ids = f.ids[:len(f.ids)-n] + } + + // Remove from the free cache. + for i := pgid(0); i < pgid(n); i++ { + delete(f.cache, initial+i) + } + f.allocs[initial] = txid + return initial + } + + previd = id + } + return 0 +} + +// free releases a page and its overflow for a given transaction id. +// If the page is already free then a panic will occur. +func (f *freelist) free(txid txid, p *page) { + if p.id <= 1 { + panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id)) + } + + // Free page and all its overflow pages. + txp := f.pending[txid] + if txp == nil { + txp = &txPending{} + f.pending[txid] = txp + } + allocTxid, ok := f.allocs[p.id] + if ok { + delete(f.allocs, p.id) + } else if (p.flags & freelistPageFlag) != 0 { + // Freelist is always allocated by prior tx. + allocTxid = txid - 1 + } + + for id := p.id; id <= p.id+pgid(p.overflow); id++ { + // Verify that page is not already free. + if f.cache[id] { + panic(fmt.Sprintf("page %d already freed", id)) + } + // Add to the freelist and cache. + txp.ids = append(txp.ids, id) + txp.alloctx = append(txp.alloctx, allocTxid) + f.cache[id] = true + } +} + +// release moves all page ids for a transaction id (or older) to the freelist. 
+func (f *freelist) release(txid txid) { + m := make(pgids, 0) + for tid, txp := range f.pending { + if tid <= txid { + // Move transaction's pending pages to the available freelist. + // Don't remove from the cache since the page is still free. + m = append(m, txp.ids...) + delete(f.pending, tid) + } + } + f.mergeSpans(m) +} + +// releaseRange moves pending pages allocated within an extent [begin,end] to the free list. +func (f *freelist) releaseRange(begin, end txid) { + if begin > end { + return + } + var m pgids + for tid, txp := range f.pending { + if tid < begin || tid > end { + continue + } + // Don't recompute freed pages if ranges haven't updated. + if txp.lastReleaseBegin == begin { + continue + } + for i := 0; i < len(txp.ids); i++ { + if atx := txp.alloctx[i]; atx < begin || atx > end { + continue + } + m = append(m, txp.ids[i]) + txp.ids[i] = txp.ids[len(txp.ids)-1] + txp.ids = txp.ids[:len(txp.ids)-1] + txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1] + txp.alloctx = txp.alloctx[:len(txp.alloctx)-1] + i-- + } + txp.lastReleaseBegin = begin + if len(txp.ids) == 0 { + delete(f.pending, tid) + } + } + f.mergeSpans(m) +} + +// rollback removes the pages from a given pending tx. +func (f *freelist) rollback(txid txid) { + // Remove page ids from cache. + txp := f.pending[txid] + if txp == nil { + return + } + var m pgids + for i, pgid := range txp.ids { + delete(f.cache, pgid) + tx := txp.alloctx[i] + if tx == 0 { + continue + } + if tx != txid { + // Pending free aborted; restore page back to alloc list. + f.allocs[pgid] = tx + } else { + // Freed page was allocated by this txn; OK to throw away. + m = append(m, pgid) + } + } + // Remove pages from pending list and mark as free if allocated by txid. + delete(f.pending, txid) + f.mergeSpans(m) +} + +// freed returns whether a given page is in the free list. +func (f *freelist) freed(pgid pgid) bool { + return f.cache[pgid] +} + +// read initializes the freelist from a freelist page. 
+func (f *freelist) read(p *page) { + if (p.flags & freelistPageFlag) == 0 { + panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.id, p.typ())) + } + // If the page.count is at the max uint16 value (64k) then it's considered + // an overflow and the size of the freelist is stored as the first element. + idx, count := 0, int(p.count) + if count == 0xFFFF { + idx = 1 + count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0]) + } + + // Copy the list of page ids from the freelist. + if count == 0 { + f.ids = nil + } else { + ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx : idx+count] + + // copy the ids, so we don't modify on the freelist page directly + idsCopy := make([]pgid, count) + copy(idsCopy, ids) + // Make sure they're sorted. + sort.Sort(pgids(idsCopy)) + + f.readIDs(idsCopy) + } +} + +// arrayReadIDs initializes the freelist from a given list of ids. +func (f *freelist) arrayReadIDs(ids []pgid) { + f.ids = ids + f.reindex() +} + +func (f *freelist) arrayGetFreePageIDs() []pgid { + return f.ids +} + +// write writes the page ids onto a freelist page. All free and pending ids are +// saved to disk since in the event of a program crash, all pending ids will +// become free. +func (f *freelist) write(p *page) error { + // Combine the old free pgids and pgids waiting on an open transaction. + + // Update the header flag. + p.flags |= freelistPageFlag + + // The page.count can only hold up to 64k elements so if we overflow that + // number then we handle it by putting the size in the first element. 
+ lenids := f.count() + if lenids == 0 { + p.count = uint16(lenids) + } else if lenids < 0xFFFF { + p.count = uint16(lenids) + f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:]) + } else { + p.count = 0xFFFF + ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids) + f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:]) + } + + return nil +} + +// reload reads the freelist from a page and filters out pending items. +func (f *freelist) reload(p *page) { + f.read(p) + + // Build a cache of only pending pages. + pcache := make(map[pgid]bool) + for _, txp := range f.pending { + for _, pendingID := range txp.ids { + pcache[pendingID] = true + } + } + + // Check each page in the freelist and build a new available freelist + // with any pages not in the pending lists. + var a []pgid + for _, id := range f.getFreePageIDs() { + if !pcache[id] { + a = append(a, id) + } + } + + f.readIDs(a) +} + +// reindex rebuilds the free cache based on available and pending free lists. 
+func (f *freelist) reindex() { + ids := f.getFreePageIDs() + f.cache = make(map[pgid]bool, len(ids)) + for _, id := range ids { + f.cache[id] = true + } + for _, txp := range f.pending { + for _, pendingID := range txp.ids { + f.cache[pendingID] = true + } + } +} + +// arrayMergeSpans try to merge list of pages(represented by pgids) with existing spans but using array +func (f *freelist) arrayMergeSpans(ids pgids) { + sort.Sort(ids) + f.ids = pgids(f.ids).merge(ids) +} diff --git a/vendor/github.com/etcd-io/bbolt/freelist_hmap.go b/vendor/github.com/etcd-io/bbolt/freelist_hmap.go new file mode 100644 index 000000000..6a03a6c3c --- /dev/null +++ b/vendor/github.com/etcd-io/bbolt/freelist_hmap.go @@ -0,0 +1,178 @@ +package bbolt + +import "sort" + +// hashmapFreeCount returns count of free pages(hashmap version) +func (f *freelist) hashmapFreeCount() int { + // use the forwardmap to get the total count + count := 0 + for _, size := range f.forwardMap { + count += int(size) + } + return count +} + +// hashmapAllocate serves the same purpose as arrayAllocate, but use hashmap as backend +func (f *freelist) hashmapAllocate(txid txid, n int) pgid { + if n == 0 { + return 0 + } + + // if we have a exact size match just return short path + if bm, ok := f.freemaps[uint64(n)]; ok { + for pid := range bm { + // remove the span + f.delSpan(pid, uint64(n)) + + f.allocs[pid] = txid + + for i := pgid(0); i < pgid(n); i++ { + delete(f.cache, pid+pgid(i)) + } + return pid + } + } + + // lookup the map to find larger span + for size, bm := range f.freemaps { + if size < uint64(n) { + continue + } + + for pid := range bm { + // remove the initial + f.delSpan(pid, uint64(size)) + + f.allocs[pid] = txid + + remain := size - uint64(n) + + // add remain span + f.addSpan(pid+pgid(n), remain) + + for i := pgid(0); i < pgid(n); i++ { + delete(f.cache, pid+pgid(i)) + } + return pid + } + } + + return 0 +} + +// hashmapReadIDs reads pgids as input an initial the freelist(hashmap version) 
+func (f *freelist) hashmapReadIDs(pgids []pgid) { + f.init(pgids) + + // Rebuild the page cache. + f.reindex() +} + +// hashmapGetFreePageIDs returns the sorted free page ids +func (f *freelist) hashmapGetFreePageIDs() []pgid { + count := f.free_count() + if count == 0 { + return nil + } + + m := make([]pgid, 0, count) + for start, size := range f.forwardMap { + for i := 0; i < int(size); i++ { + m = append(m, start+pgid(i)) + } + } + sort.Sort(pgids(m)) + + return m +} + +// hashmapMergeSpans try to merge list of pages(represented by pgids) with existing spans +func (f *freelist) hashmapMergeSpans(ids pgids) { + for _, id := range ids { + // try to see if we can merge and update + f.mergeWithExistingSpan(id) + } +} + +// mergeWithExistingSpan merges pid to the existing free spans, try to merge it backward and forward +func (f *freelist) mergeWithExistingSpan(pid pgid) { + prev := pid - 1 + next := pid + 1 + + preSize, mergeWithPrev := f.backwardMap[prev] + nextSize, mergeWithNext := f.forwardMap[next] + newStart := pid + newSize := uint64(1) + + if mergeWithPrev { + //merge with previous span + start := prev + 1 - pgid(preSize) + f.delSpan(start, preSize) + + newStart -= pgid(preSize) + newSize += preSize + } + + if mergeWithNext { + // merge with next span + f.delSpan(next, nextSize) + newSize += nextSize + } + + f.addSpan(newStart, newSize) +} + +func (f *freelist) addSpan(start pgid, size uint64) { + f.backwardMap[start-1+pgid(size)] = size + f.forwardMap[start] = size + if _, ok := f.freemaps[size]; !ok { + f.freemaps[size] = make(map[pgid]struct{}) + } + + f.freemaps[size][start] = struct{}{} +} + +func (f *freelist) delSpan(start pgid, size uint64) { + delete(f.forwardMap, start) + delete(f.backwardMap, start+pgid(size-1)) + delete(f.freemaps[size], start) + if len(f.freemaps[size]) == 0 { + delete(f.freemaps, size) + } +} + +// initial from pgids using when use hashmap version +// pgids must be sorted +func (f *freelist) init(pgids []pgid) { + if 
len(pgids) == 0 { + return + } + + size := uint64(1) + start := pgids[0] + + if !sort.SliceIsSorted([]pgid(pgids), func(i, j int) bool { return pgids[i] < pgids[j] }) { + panic("pgids not sorted") + } + + f.freemaps = make(map[uint64]pidSet) + f.forwardMap = make(map[pgid]uint64) + f.backwardMap = make(map[pgid]uint64) + + for i := 1; i < len(pgids); i++ { + // continuous page + if pgids[i] == pgids[i-1]+1 { + size++ + } else { + f.addSpan(start, size) + + size = 1 + start = pgids[i] + } + } + + // init the tail + if size != 0 && start != 0 { + f.addSpan(start, size) + } +} diff --git a/vendor/github.com/boltdb/bolt/node.go b/vendor/github.com/etcd-io/bbolt/node.go similarity index 99% rename from vendor/github.com/boltdb/bolt/node.go rename to vendor/github.com/etcd-io/bbolt/node.go index 159318b22..6c3fa553e 100644 --- a/vendor/github.com/boltdb/bolt/node.go +++ b/vendor/github.com/etcd-io/bbolt/node.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "bytes" @@ -365,7 +365,7 @@ func (n *node) spill() error { } // Allocate contiguous space for the node. 
- p, err := tx.allocate((node.size() / tx.db.pageSize) + 1) + p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize) if err != nil { return err } diff --git a/vendor/github.com/boltdb/bolt/page.go b/vendor/github.com/etcd-io/bbolt/page.go similarity index 99% rename from vendor/github.com/boltdb/bolt/page.go rename to vendor/github.com/etcd-io/bbolt/page.go index cde403ae8..bca9615f0 100644 --- a/vendor/github.com/boltdb/bolt/page.go +++ b/vendor/github.com/etcd-io/bbolt/page.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "fmt" diff --git a/vendor/github.com/boltdb/bolt/tx.go b/vendor/github.com/etcd-io/bbolt/tx.go similarity index 94% rename from vendor/github.com/boltdb/bolt/tx.go rename to vendor/github.com/etcd-io/bbolt/tx.go index 6700308a2..f50864142 100644 --- a/vendor/github.com/boltdb/bolt/tx.go +++ b/vendor/github.com/etcd-io/bbolt/tx.go @@ -1,4 +1,4 @@ -package bolt +package bbolt import ( "fmt" @@ -126,10 +126,7 @@ func (tx *Tx) DeleteBucket(name []byte) error { // the error is returned to the caller. func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error { return tx.root.ForEach(func(k, v []byte) error { - if err := fn(k, tx.root.Bucket(k)); err != nil { - return err - } - return nil + return fn(k, tx.root.Bucket(k)) }) } @@ -169,28 +166,18 @@ func (tx *Tx) Commit() error { // Free the old root bucket. tx.meta.root.root = tx.root.root - opgid := tx.meta.pgid - - // Free the freelist and allocate new pages for it. This will overestimate - // the size of the freelist but not underestimate the size (which would be bad). - tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) - p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1) - if err != nil { - tx.rollback() - return err - } - if err := tx.db.freelist.write(p); err != nil { - tx.rollback() - return err + // Free the old freelist because commit writes out a fresh freelist. 
+ if tx.meta.freelist != pgidNoFreelist { + tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) } - tx.meta.freelist = p.id - // If the high water mark has moved up then attempt to grow the database. - if tx.meta.pgid > opgid { - if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { - tx.rollback() + if !tx.db.NoFreelistSync { + err := tx.commitFreelist() + if err != nil { return err } + } else { + tx.meta.freelist = pgidNoFreelist } // Write dirty pages to disk. @@ -235,6 +222,31 @@ func (tx *Tx) Commit() error { return nil } +func (tx *Tx) commitFreelist() error { + // Allocate new pages for the new free list. This will overestimate + // the size of the freelist but not underestimate the size (which would be bad). + opgid := tx.meta.pgid + p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1) + if err != nil { + tx.rollback() + return err + } + if err := tx.db.freelist.write(p); err != nil { + tx.rollback() + return err + } + tx.meta.freelist = p.id + // If the high water mark has moved up then attempt to grow the database. + if tx.meta.pgid > opgid { + if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { + tx.rollback() + return err + } + } + + return nil +} + // Rollback closes the transaction and ignores all previous updates. Read-only // transactions must be rolled back and not committed. func (tx *Tx) Rollback() error { @@ -291,7 +303,9 @@ func (tx *Tx) close() { } // Copy writes the entire database to a writer. -// This function exists for backwards compatibility. Use WriteTo() instead. +// This function exists for backwards compatibility. +// +// Deprecated; Use WriteTo() instead. 
func (tx *Tx) Copy(w io.Writer) error { _, err := tx.WriteTo(w) return err @@ -305,7 +319,11 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { if err != nil { return 0, err } - defer func() { _ = f.Close() }() + defer func() { + if cerr := f.Close(); err == nil { + err = cerr + } + }() // Generate a meta page. We use the same page data for both meta pages. buf := make([]byte, tx.db.pageSize) @@ -333,7 +351,7 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { } // Move past the meta pages in the file. - if _, err := f.Seek(int64(tx.db.pageSize*2), os.SEEK_SET); err != nil { + if _, err := f.Seek(int64(tx.db.pageSize*2), io.SeekStart); err != nil { return n, fmt.Errorf("seek: %s", err) } @@ -344,7 +362,7 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { return n, err } - return n, f.Close() + return n, nil } // CopyFile copies the entire database to file at the given path. @@ -379,6 +397,9 @@ func (tx *Tx) Check() <-chan error { } func (tx *Tx) check(ch chan error) { + // Force loading free list if opened in ReadOnly mode. + tx.db.loadFreelist() + // Check if any pages are double freed. freed := make(map[pgid]bool) all := make([]pgid, tx.db.freelist.count()) @@ -394,8 +415,10 @@ func (tx *Tx) check(ch chan error) { reachable := make(map[pgid]*page) reachable[0] = tx.page(0) // meta0 reachable[1] = tx.page(1) // meta1 - for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { - reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) + if tx.meta.freelist != pgidNoFreelist { + for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { + reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) + } } // Recursively check buckets. @@ -453,7 +476,7 @@ func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bo // allocate returns a contiguous block of memory starting at a given page. 
func (tx *Tx) allocate(count int) (*page, error) { - p, err := tx.db.allocate(count) + p, err := tx.db.allocate(tx.meta.txid, count) if err != nil { return nil, err } @@ -462,7 +485,7 @@ func (tx *Tx) allocate(count int) (*page, error) { tx.pages[p.id] = p // Update statistics. - tx.stats.PageCount++ + tx.stats.PageCount += count tx.stats.PageAlloc += count * tx.db.pageSize return p, nil