Update bleve dependency to latest master revision (#6100)

* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2

* remove unused pkg from dep file

* change bleve from master to recent revision
tokarchuk/v1.17
Lunny Xiao 6 years ago committed by techknowlogick
parent 11e316654e
commit a380cfd8e0
  1. 36
      Gopkg.lock
  2. 9
      Gopkg.toml
  3. 22
      vendor/github.com/Smerity/govarint/LICENSE
  4. 229
      vendor/github.com/Smerity/govarint/govarint.go
  5. 41
      vendor/github.com/blevesearch/bleve/analysis/freq.go
  6. 8
      vendor/github.com/blevesearch/bleve/analysis/token/camelcase/parser.go
  7. 2
      vendor/github.com/blevesearch/bleve/analysis/token/unique/unique.go
  8. 29
      vendor/github.com/blevesearch/bleve/document/document.go
  9. 2
      vendor/github.com/blevesearch/bleve/document/field.go
  10. 16
      vendor/github.com/blevesearch/bleve/document/field_boolean.go
  11. 25
      vendor/github.com/blevesearch/bleve/document/field_composite.go
  12. 15
      vendor/github.com/blevesearch/bleve/document/field_datetime.go
  13. 15
      vendor/github.com/blevesearch/bleve/document/field_geopoint.go
  14. 15
      vendor/github.com/blevesearch/bleve/document/field_numeric.go
  15. 16
      vendor/github.com/blevesearch/bleve/document/field_text.go
  16. 174
      vendor/github.com/blevesearch/bleve/geo/geohash.go
  17. 43
      vendor/github.com/blevesearch/bleve/geo/parse.go
  18. 35
      vendor/github.com/blevesearch/bleve/index.go
  19. 19
      vendor/github.com/blevesearch/bleve/index/analysis.go
  20. 120
      vendor/github.com/blevesearch/bleve/index/index.go
  21. 268
      vendor/github.com/blevesearch/bleve/index/scorch/introducer.go
  22. 144
      vendor/github.com/blevesearch/bleve/index/scorch/merge.go
  23. 23
      vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go
  24. 420
      vendor/github.com/blevesearch/bleve/index/scorch/optimize.go
  25. 273
      vendor/github.com/blevesearch/bleve/index/scorch/persister.go
  26. 110
      vendor/github.com/blevesearch/bleve/index/scorch/reader.go
  27. 235
      vendor/github.com/blevesearch/bleve/index/scorch/scorch.go
  28. 40
      vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go
  29. 321
      vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/build.go
  30. 103
      vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/dict.go
  31. 178
      vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/posting.go
  32. 289
      vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/segment.go
  33. 75
      vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
  34. 43
      vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go
  35. 542
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go
  36. 107
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go
  37. 10
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go
  38. 125
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go
  39. 238
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go
  40. 16
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go
  41. 79
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go
  42. 544
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go
  43. 826
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
  44. 749
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go
  45. 232
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go
  46. 22
      vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go
  47. 378
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go
  48. 17
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go
  49. 13
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go
  50. 77
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go
  51. 2
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go
  52. 140
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go
  53. 154
      vendor/github.com/blevesearch/bleve/index/scorch/stats.go
  54. 2
      vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go
  55. 2
      vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go
  56. 8
      vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go
  57. 23
      vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go
  58. 39
      vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go
  59. 31
      vendor/github.com/blevesearch/bleve/index/upsidedown/row.go
  60. 9
      vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go
  61. 1
      vendor/github.com/blevesearch/bleve/index_alias_impl.go
  62. 105
      vendor/github.com/blevesearch/bleve/index_impl.go
  63. 3
      vendor/github.com/blevesearch/bleve/index_meta.go
  64. 12
      vendor/github.com/blevesearch/bleve/mapping/document.go
  65. 2
      vendor/github.com/blevesearch/bleve/mapping/index.go
  66. 3
      vendor/github.com/blevesearch/bleve/mapping/reflect.go
  67. 2
      vendor/github.com/blevesearch/bleve/numeric/bin.go
  68. 4
      vendor/github.com/blevesearch/bleve/numeric/prefix_coded.go
  69. 77
      vendor/github.com/blevesearch/bleve/search.go
  70. 20
      vendor/github.com/blevesearch/bleve/search/collector.go
  71. 4
      vendor/github.com/blevesearch/bleve/search/collector/heap.go
  72. 5
      vendor/github.com/blevesearch/bleve/search/collector/list.go
  73. 4
      vendor/github.com/blevesearch/bleve/search/collector/slice.go
  74. 105
      vendor/github.com/blevesearch/bleve/search/collector/topn.go
  75. 21
      vendor/github.com/blevesearch/bleve/search/explanation.go
  76. 29
      vendor/github.com/blevesearch/bleve/search/facet/facet_builder_datetime.go
  77. 29
      vendor/github.com/blevesearch/bleve/search/facet/facet_builder_numeric.go
  78. 21
      vendor/github.com/blevesearch/bleve/search/facet/facet_builder_terms.go
  79. 56
      vendor/github.com/blevesearch/bleve/search/facets_builder.go
  80. 17
      vendor/github.com/blevesearch/bleve/search/levenshtein.go
  81. 11
      vendor/github.com/blevesearch/bleve/search/pool.go
  82. 2
      vendor/github.com/blevesearch/bleve/search/query/conjunction.go
  83. 11
      vendor/github.com/blevesearch/bleve/search/query/disjunction.go
  84. 12
      vendor/github.com/blevesearch/bleve/search/query/query.go
  85. 1
      vendor/github.com/blevesearch/bleve/search/query/query_string_lex.go
  86. 35
      vendor/github.com/blevesearch/bleve/search/query/regexp.go
  87. 23
      vendor/github.com/blevesearch/bleve/search/query/wildcard.go
  88. 25
      vendor/github.com/blevesearch/bleve/search/scorer/scorer_conjunction.go
  89. 19
      vendor/github.com/blevesearch/bleve/search/scorer/scorer_constant.go
  90. 24
      vendor/github.com/blevesearch/bleve/search/scorer/scorer_disjunction.go
  91. 88
      vendor/github.com/blevesearch/bleve/search/scorer/scorer_term.go
  92. 155
      vendor/github.com/blevesearch/bleve/search/search.go
  93. 47
      vendor/github.com/blevesearch/bleve/search/searcher/search_boolean.go
  94. 57
      vendor/github.com/blevesearch/bleve/search/searcher/search_conjunction.go
  95. 259
      vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go
  96. 343
      vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
  97. 298
      vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
  98. 16
      vendor/github.com/blevesearch/bleve/search/searcher/search_docid.go
  99. 15
      vendor/github.com/blevesearch/bleve/search/searcher/search_filter.go
  100. 49
      vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go
  101. Some files were not shown because too many files have changed in this diff Show More

36
Gopkg.lock generated

@ -40,14 +40,6 @@
revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0" revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0"
version = "v0.4.7" version = "v0.4.7"
[[projects]]
branch = "master"
digest = "1:93367b6d47a8ccc7d14f9f493ccf103ccf5afb698559ff8e8f1999427ce27ace"
name = "github.com/Smerity/govarint"
packages = ["."]
pruneopts = "NUT"
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab"
[[projects]] [[projects]]
branch = "master" branch = "master"
digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146" digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146"
@ -98,7 +90,8 @@
revision = "3a771d992973f24aa725d07868b467d1ddfceafb" revision = "3a771d992973f24aa725d07868b467d1ddfceafb"
[[projects]] [[projects]]
digest = "1:c10f35be6200b09e26da267ca80f837315093ecaba27e7a223071380efb9dd32" branch = "master"
digest = "1:b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2"
name = "github.com/blevesearch/bleve" name = "github.com/blevesearch/bleve"
packages = [ packages = [
".", ".",
@ -121,7 +114,6 @@
"index/scorch", "index/scorch",
"index/scorch/mergeplan", "index/scorch/mergeplan",
"index/scorch/segment", "index/scorch/segment",
"index/scorch/segment/mem",
"index/scorch/segment/zap", "index/scorch/segment/zap",
"index/store", "index/store",
"index/store/boltdb", "index/store/boltdb",
@ -141,9 +133,10 @@
"search/query", "search/query",
"search/scorer", "search/scorer",
"search/searcher", "search/searcher",
"size",
] ]
pruneopts = "NUT" pruneopts = "NUT"
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5"
[[projects]] [[projects]]
branch = "master" branch = "master"
@ -160,14 +153,6 @@
pruneopts = "NUT" pruneopts = "NUT"
revision = "db70c57796cc8c310613541dfade3dce627d09c7" revision = "db70c57796cc8c310613541dfade3dce627d09c7"
[[projects]]
digest = "1:c7e0968c05659f3973148cd5c5387d6ee960a6ae1b2eaaec0b1d435d806458bb"
name = "github.com/boltdb/bolt"
packages = ["."]
pruneopts = "NUT"
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7"
source = "github.com/go-gitea/bolt"
[[projects]] [[projects]]
digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5" digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5"
name = "github.com/boombuler/barcode" name = "github.com/boombuler/barcode"
@ -217,15 +202,16 @@
[[projects]] [[projects]]
branch = "master" branch = "master"
digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6" digest = "1:6a658ac7d23204dc743c7155557c45273747d78e05ae0579742bd6b744bce215"
name = "github.com/couchbase/vellum" name = "github.com/couchbase/vellum"
packages = [ packages = [
".", ".",
"levenshtein2",
"regexp", "regexp",
"utf8", "utf8",
] ]
pruneopts = "NUT" pruneopts = "NUT"
revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4" revision = "e91b68ff3efe3cc11723aa25dd315cbc9276cd65"
[[projects]] [[projects]]
branch = "master" branch = "master"
@ -287,6 +273,14 @@
revision = "1615341f118ae12f353cc8a983f35b584342c9b3" revision = "1615341f118ae12f353cc8a983f35b584342c9b3"
version = "v1.12.0" version = "v1.12.0"
[[projects]]
digest = "1:ae8eea1a24ae43a46c2e96631b6303fcc4210ca0ac9d643e4da965029d1b511d"
name = "github.com/etcd-io/bbolt"
packages = ["."]
pruneopts = "NUT"
revision = "63597a96ec0ad9e6d43c3fc81e809909e0237461"
version = "v1.3.2"
[[projects]] [[projects]]
digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48" digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48"
name = "github.com/ethantkoenig/rupture" name = "github.com/ethantkoenig/rupture"

@ -15,10 +15,8 @@ ignored = ["google.golang.org/appengine*"]
name = "code.gitea.io/sdk" name = "code.gitea.io/sdk"
[[constraint]] [[constraint]]
# branch = "master" revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5"
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026"
name = "github.com/blevesearch/bleve" name = "github.com/blevesearch/bleve"
#Not targetting v0.7.0 since standard where use only just after this tag
[[constraint]] [[constraint]]
revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e" revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e"
@ -108,11 +106,6 @@ ignored = ["google.golang.org/appengine*"]
name = "gopkg.in/testfixtures.v2" name = "gopkg.in/testfixtures.v2"
version = "2.0.0" version = "2.0.0"
[[override]]
name = "github.com/boltdb/bolt"
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7"
source = "github.com/go-gitea/bolt"
[[override]] [[override]]
branch = "master" branch = "master"
name = "golang.org/x/oauth2" name = "golang.org/x/oauth2"

@ -1,22 +0,0 @@
The MIT License (MIT)
Copyright (c) 2015 Stephen Merity
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -1,229 +0,0 @@
package govarint
import "encoding/binary"
import "io"
// U32VarintEncoder declares the write side of a 32-bit varint codec: PutU32
// accepts one value and returns an int, and Close finishes the encoder.
// NOTE(review): the encoders defined later in this file declare PutU32 as
// returning (int, error), so they do not satisfy this interface as written.
type U32VarintEncoder interface {
PutU32(x uint32) int
Close()
}
// U32VarintDecoder declares the read side of a 32-bit varint codec: each
// GetU32 call returns one uint32 value or an error.
type U32VarintDecoder interface {
GetU32() (uint32, error)
}
///
// U64VarintEncoder declares the write side of a 64-bit varint codec: PutU64
// accepts one value and returns an int, and Close finishes the encoder.
// NOTE(review): Base128Encoder.PutU64 in this file returns (int, error), so it
// does not satisfy this interface as written.
type U64VarintEncoder interface {
PutU64(x uint64) int
Close()
}
// U64VarintDecoder declares the read side of a 64-bit varint codec: each
// GetU64 call returns one uint64 value or an error.
type U64VarintDecoder interface {
GetU64() (uint64, error)
}
///
// U32GroupVarintEncoder writes uint32 values in groups of four. Each group is
// one size byte (two bits per value, holding byte-length minus one) followed
// by the big-endian bytes of the values with leading zero bytes removed.
type U32GroupVarintEncoder struct {
	w     io.Writer
	index int        // number of values buffered in store and not yet written
	store [4]uint32  // values of the group currently being assembled
	temp  [17]byte   // scratch: 1 size byte + up to 4 bytes for each of 4 values
}

// NewU32GroupVarintEncoder returns an encoder that writes groups to w.
func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder {
	return &U32GroupVarintEncoder{w: w}
}

// Flush writes the buffered values as one (possibly partial) group and returns
// the number of bytes handed to the writer together with the write error.
//
// The buffer is emptied whether or not the write succeeds. Leaving it full
// after a failed write made the next PutU32 index past the end of store, and
// leaving it non-empty after a successful partial flush made a second
// Flush/Close emit the same values again.
func (b *U32GroupVarintEncoder) Flush() (int, error) {
	// Nothing buffered, nothing to write.
	if b.index == 0 {
		return 0, nil
	}
	pending := b.index
	b.index = 0

	// Unused slots must be zero so that their size bits come out as zero.
	for i := pending; i < len(b.store); i++ {
		b.store[i] = 0
	}

	length := 1
	// The size byte is only ever OR-ed into, so clear it first.
	b.temp[0] = 0
	for i, x := range b.store {
		size := byte(0)
		for _, shift := range [...]uint{24, 16, 8, 0} {
			// The low byte (shift == 0) is always written; higher bytes are
			// written from the first non-zero one onwards.
			if (x>>shift) != 0 || shift == 0 {
				size++
				b.temp[length] = byte(x >> shift)
				length++
			}
		}
		// Two bits per value in the size byte; 0 means one byte.
		b.temp[0] |= (size - 1) << (uint(3-i) * 2)
	}

	// Each unused slot contributed exactly one zero byte; drop those so the
	// decoder can recognise a partial group by hitting EOF.
	length -= len(b.store) - pending

	_, err := b.w.Write(b.temp[:length])
	return length, err
}

// PutU32 buffers x and, once four values are buffered, writes the group.
// It returns the number of bytes written by this call (zero while the group
// is still filling) and any error from the underlying writer.
func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) {
	b.store[b.index] = x
	b.index++
	if b.index < len(b.store) {
		return 0, nil
	}
	return b.Flush()
}

// Close writes any values still buffered as a final partial group. The write
// error cannot be reported through this signature; callers that need it
// should call Flush themselves before Close.
func (b *U32GroupVarintEncoder) Close() {
	_, _ = b.Flush()
}
///
// U32GroupVarintDecoder reads values written in the group-varint layout: a
// size byte carrying four 2-bit lengths (byte count minus one), followed by
// the big-endian bytes of up to four values.
type U32GroupVarintDecoder struct {
	r        io.ByteReader
	group    [4]uint32 // values decoded from the current group
	pos      int       // index of the next value in group to hand out
	finished bool      // set once a partial (final) group has been read
	capacity int       // number of valid entries in group
}

// NewU32GroupVarintDecoder returns a decoder reading from r. pos starts equal
// to capacity so the first GetU32 call loads a group.
func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder {
	return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4}
}

// getGroup reads the next size byte and as many of its four values as the
// input holds. Hitting io.EOF while reading a value marks the group as the
// final, partial one: capacity is set to the number of complete values and
// finished is set. Any other read error is returned as soon as it happens.
func (b *U32GroupVarintDecoder) getGroup() error {
	// A size byte must be present whenever more values are expected.
	sizeByte, err := b.r.ReadByte()
	if err != nil {
		return err
	}

	for index := range b.group {
		// 0b00 means one byte follows, 0b01 two, and so on.
		width := (int(sizeByte>>(uint(3-index)*2)) & 3) + 1
		var value uint32
		for n := 0; n < width; n++ {
			c, err := b.r.ReadByte()
			if err == io.EOF {
				// Partial group: keep the values read so far.
				b.capacity = index
				b.finished = true
				b.pos = 0
				return nil
			}
			if err != nil {
				return err
			}
			value = value<<8 | uint32(c)
		}
		b.group[index] = value
	}

	// Start handing out values from the beginning of the new group.
	b.pos = 0
	return nil
}

// GetU32 returns the next decoded value, or io.EOF once the input is
// exhausted. A group that turns out to hold no complete value (a size byte
// followed directly by EOF) also yields io.EOF instead of a made-up zero.
func (b *U32GroupVarintDecoder) GetU32() (uint32, error) {
	if b.pos >= b.capacity {
		if b.finished {
			return 0, io.EOF
		}
		if err := b.getGroup(); err != nil {
			return 0, err
		}
		if b.capacity == 0 {
			return 0, io.EOF
		}
	}
	v := b.group[b.pos]
	b.pos++
	return v, nil
}
///
// Base128Encoder writes unsigned integers to w using the base-128 varint
// format produced by encoding/binary.PutUvarint.
type Base128Encoder struct {
	w        io.Writer
	tmpBytes []byte // scratch buffer reused for every Put call
}

// NewU32Base128Encoder returns an encoder writing to w.
//
// The scratch buffer is sized for 64-bit values even here: the returned type
// also exposes PutU64, and a 32-bit-sized buffer made PutU64 panic inside
// binary.PutUvarint for values needing more than five bytes.
func NewU32Base128Encoder(w io.Writer) *Base128Encoder {
	return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
}

// NewU64Base128Encoder returns an encoder writing to w.
func NewU64Base128Encoder(w io.Writer) *Base128Encoder {
	return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
}

// PutU32 writes x as a varint and returns the writer's byte count and error.
func (b *Base128Encoder) PutU32(x uint32) (int, error) {
	return b.PutU64(uint64(x))
}

// PutU64 writes x as a varint and returns the writer's byte count and error.
func (b *Base128Encoder) PutU64(x uint64) (int, error) {
	n := binary.PutUvarint(b.tmpBytes, x)
	return b.w.Write(b.tmpBytes[:n])
}

// Close is a no-op; every Put call writes straight through to the writer.
func (b *Base128Encoder) Close() {
}
///
// Base128Decoder reads base-128 varints from a byte reader using
// encoding/binary.ReadUvarint.
type Base128Decoder struct {
	r io.ByteReader
}

// NewU32Base128Decoder returns a decoder reading from r.
func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder {
	dec := new(Base128Decoder)
	dec.r = r
	return dec
}

// NewU64Base128Decoder returns a decoder reading from r.
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder {
	dec := new(Base128Decoder)
	dec.r = r
	return dec
}

// GetU32 reads one varint and returns it converted to uint32 (high bits are
// discarded by the conversion), along with the read error.
func (b *Base128Decoder) GetU32() (uint32, error) {
	wide, err := b.GetU64()
	return uint32(wide), err
}

// GetU64 reads one varint and returns it with the read error.
func (b *Base128Decoder) GetU64() (uint64, error) {
	value, err := binary.ReadUvarint(b.r)
	return value, err
}

@ -14,6 +14,22 @@
package analysis package analysis
import (
"reflect"
"github.com/blevesearch/bleve/size"
)
// Struct sizes of TokenLocation and TokenFreq as reported by reflect, captured
// once at package initialisation and used by the Size methods.
var (
	reflectStaticSizeTokenLocation int
	reflectStaticSizeTokenFreq     int
)

func init() {
	reflectStaticSizeTokenLocation = int(reflect.TypeOf(TokenLocation{}).Size())
	reflectStaticSizeTokenFreq = int(reflect.TypeOf(TokenFreq{}).Size())
}
// TokenLocation represents one occurrence of a term at a particular location in // TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token. // a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document. // Field and ArrayPositions identify the field value in the source document.
@ -26,6 +42,12 @@ type TokenLocation struct {
Position int Position int
} }
// Size returns the struct's static size plus size.SizeOfUint64 for every
// entry in ArrayPositions.
func (tl *TokenLocation) Size() int {
	return reflectStaticSizeTokenLocation + size.SizeOfUint64*len(tl.ArrayPositions)
}
// TokenFreq represents all the occurrences of a term in all fields of a // TokenFreq represents all the occurrences of a term in all fields of a
// document. // document.
type TokenFreq struct { type TokenFreq struct {
@ -34,6 +56,15 @@ type TokenFreq struct {
frequency int frequency int
} }
// Size returns the struct's static size plus the length of Term and the Size
// of every entry in Locations.
func (tf *TokenFreq) Size() int {
	total := reflectStaticSizeTokenFreq + len(tf.Term)
	for _, location := range tf.Locations {
		total += location.Size()
	}
	return total
}
func (tf *TokenFreq) Frequency() int { func (tf *TokenFreq) Frequency() int {
return tf.frequency return tf.frequency
} }
@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int {
// fields. // fields.
type TokenFrequencies map[string]*TokenFreq type TokenFrequencies map[string]*TokenFreq
// Size returns size.SizeOfMap, plus a string header and a pointer per entry,
// plus each key's length and each value's Size.
func (tfs TokenFrequencies) Size() int {
	total := size.SizeOfMap + len(tfs)*(size.SizeOfString+size.SizeOfPtr)
	for term, freq := range tfs {
		total += len(term) + freq.Size()
	}
	return total
}
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
// walk the new token frequencies // walk the new token frequencies
for tfk, tf := range other { for tfk, tf := range other {

@ -46,11 +46,11 @@ type Parser struct {
index int index int
} }
func NewParser(len, position, index int) *Parser { func NewParser(length, position, index int) *Parser {
return &Parser{ return &Parser{
bufferLen: len, bufferLen: length,
buffer: make([]rune, 0, len), buffer: make([]rune, 0, length),
tokens: make([]*analysis.Token, 0, len), tokens: make([]*analysis.Token, 0, length),
position: position, position: position,
index: index, index: index,
} }

@ -21,7 +21,7 @@ import (
const Name = "unique" const Name = "unique"
// UniqueTermFilter retains only the tokens which mark the first occurence of // UniqueTermFilter retains only the tokens which mark the first occurrence of
// a term. Tokens whose term appears in a preceding token are dropped. // a term. Tokens whose term appears in a preceding token are dropped.
type UniqueTermFilter struct{} type UniqueTermFilter struct{}

@ -14,7 +14,19 @@
package document package document
import "fmt" import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/size"
)
// reflectStaticSizeDocument is the struct size of Document as reported by
// reflect, captured once at package initialisation.
var reflectStaticSizeDocument int

func init() {
	reflectStaticSizeDocument = int(reflect.TypeOf(Document{}).Size())
}
type Document struct { type Document struct {
ID string `json:"id"` ID string `json:"id"`
@ -30,6 +42,21 @@ func NewDocument(id string) *Document {
} }
} }
// Size returns the document's static struct size plus a pointer, the length
// of its ID, and the Size of every entry in Fields and CompositeFields.
func (d *Document) Size() int {
	total := reflectStaticSizeDocument + size.SizeOfPtr + len(d.ID)
	for _, field := range d.Fields {
		total += field.Size()
	}
	for _, composite := range d.CompositeFields {
		total += composite.Size()
	}
	return total
}
func (d *Document) AddField(f Field) *Document { func (d *Document) AddField(f Field) *Document {
switch f := f.(type) { switch f := f.(type) {
case *CompositeField: case *CompositeField:

@ -36,4 +36,6 @@ type Field interface {
// that this field represents - this is a common metric for tracking // that this field represents - this is a common metric for tracking
// the rate of indexing // the rate of indexing
NumPlainTextBytes() uint64 NumPlainTextBytes() uint64
Size() int
} }

@ -16,10 +16,19 @@ package document
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeBooleanField is the struct size of BooleanField as reported
// by reflect, captured once at package initialisation.
var reflectStaticSizeBooleanField int

func init() {
	reflectStaticSizeBooleanField = int(reflect.TypeOf(BooleanField{}).Size())
}
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
type BooleanField struct { type BooleanField struct {
@ -30,6 +39,13 @@ type BooleanField struct {
numPlainTextBytes uint64 numPlainTextBytes uint64
} }
// Size returns the field's static struct size plus a pointer, the lengths of
// its name and value, and size.SizeOfUint64 per array position.
func (b *BooleanField) Size() int {
	total := reflectStaticSizeBooleanField + size.SizeOfPtr
	total += len(b.name)
	total += len(b.arrayPositions) * size.SizeOfUint64
	total += len(b.value)
	return total
}
func (b *BooleanField) Name() string { func (b *BooleanField) Name() string {
return b.name return b.name
} }

@ -15,9 +15,19 @@
package document package document
import ( import (
"reflect"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeCompositeField is the struct size of CompositeField as
// reported by reflect, captured once at package initialisation.
var reflectStaticSizeCompositeField int

func init() {
	reflectStaticSizeCompositeField = int(reflect.TypeOf(CompositeField{}).Size())
}
const DefaultCompositeIndexingOptions = IndexField const DefaultCompositeIndexingOptions = IndexField
type CompositeField struct { type CompositeField struct {
@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl
return rv return rv
} }
// Size returns the field's static struct size plus a pointer and the length
// of its name; every key of includedFields and excludedFields adds a string
// header, the key's length, and one bool.
func (c *CompositeField) Size() int {
	total := reflectStaticSizeCompositeField + size.SizeOfPtr + len(c.name)
	for included := range c.includedFields {
		total += size.SizeOfString + len(included) + size.SizeOfBool
	}
	for excluded := range c.excludedFields {
		total += size.SizeOfString + len(excluded) + size.SizeOfBool
	}
	return total
}
func (c *CompositeField) Name() string { func (c *CompositeField) Name() string {
return c.name return c.name
} }

@ -17,12 +17,21 @@ package document
import ( import (
"fmt" "fmt"
"math" "math"
"reflect"
"time" "time"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeDateTimeField is the struct size of DateTimeField as
// reported by reflect, captured once at package initialisation.
var reflectStaticSizeDateTimeField int

func init() {
	reflectStaticSizeDateTimeField = int(reflect.TypeOf(DateTimeField{}).Size())
}
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
const DefaultDateTimePrecisionStep uint = 4 const DefaultDateTimePrecisionStep uint = 4
@ -37,6 +46,12 @@ type DateTimeField struct {
numPlainTextBytes uint64 numPlainTextBytes uint64
} }
// Size returns the field's static struct size plus a pointer, the length of
// its name, and size.SizeOfUint64 per array position.
func (n *DateTimeField) Size() int {
	total := reflectStaticSizeDateTimeField + size.SizeOfPtr
	total += len(n.name)
	total += len(n.arrayPositions) * size.SizeOfUint64
	return total
}
func (n *DateTimeField) Name() string { func (n *DateTimeField) Name() string {
return n.name return n.name
} }

@ -16,12 +16,21 @@ package document
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/geo" "github.com/blevesearch/bleve/geo"
"github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeGeoPointField is the struct size of GeoPointField as
// reported by reflect, captured once at package initialisation.
var reflectStaticSizeGeoPointField int

func init() {
	reflectStaticSizeGeoPointField = int(reflect.TypeOf(GeoPointField{}).Size())
}
var GeoPrecisionStep uint = 9 var GeoPrecisionStep uint = 9
type GeoPointField struct { type GeoPointField struct {
@ -32,6 +41,12 @@ type GeoPointField struct {
numPlainTextBytes uint64 numPlainTextBytes uint64
} }
// Size returns the field's static struct size plus a pointer, the length of
// its name, and size.SizeOfUint64 per array position.
func (n *GeoPointField) Size() int {
	total := reflectStaticSizeGeoPointField + size.SizeOfPtr
	total += len(n.name)
	total += len(n.arrayPositions) * size.SizeOfUint64
	return total
}
func (n *GeoPointField) Name() string { func (n *GeoPointField) Name() string {
return n.name return n.name
} }

@ -16,11 +16,20 @@ package document
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeNumericField is the struct size of NumericField as reported
// by reflect, captured once at package initialisation.
var reflectStaticSizeNumericField int

func init() {
	reflectStaticSizeNumericField = int(reflect.TypeOf(NumericField{}).Size())
}
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
const DefaultPrecisionStep uint = 4 const DefaultPrecisionStep uint = 4
@ -33,6 +42,12 @@ type NumericField struct {
numPlainTextBytes uint64 numPlainTextBytes uint64
} }
// Size returns the field's static struct size plus a pointer, the length of
// its name, and size.SizeOfUint64 per array position.
//
// Array positions are charged at size.SizeOfUint64, matching the boolean,
// datetime, geopoint and text fields; the previous size.SizeOfPtr multiplier
// gives a different result wherever pointers are not 8 bytes wide.
func (n *NumericField) Size() int {
	return reflectStaticSizeNumericField + size.SizeOfPtr +
		len(n.name) +
		len(n.arrayPositions)*size.SizeOfUint64
}
func (n *NumericField) Name() string { func (n *NumericField) Name() string {
return n.name return n.name
} }

@ -16,10 +16,19 @@ package document
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeTextField is the struct size of TextField as reported by
// reflect, captured once at package initialisation.
var reflectStaticSizeTextField int

func init() {
	reflectStaticSizeTextField = int(reflect.TypeOf(TextField{}).Size())
}
const DefaultTextIndexingOptions = IndexField | DocValues const DefaultTextIndexingOptions = IndexField | DocValues
type TextField struct { type TextField struct {
@ -31,6 +40,13 @@ type TextField struct {
numPlainTextBytes uint64 numPlainTextBytes uint64
} }
// Size returns the field's static struct size plus a pointer, the lengths of
// its name and value, and size.SizeOfUint64 per array position.
func (t *TextField) Size() int {
	total := reflectStaticSizeTextField + size.SizeOfPtr
	total += len(t.name)
	total += len(t.arrayPositions) * size.SizeOfUint64
	total += len(t.value)
	return total
}
func (t *TextField) Name() string { func (t *TextField) Name() string {
return t.name return t.name
} }

@ -0,0 +1,174 @@
// The code here was obtained from:
// https://github.com/mmcloughlin/geohash
// The MIT License (MIT)
// Copyright (c) 2015 Michael McLoughlin
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package geo
import (
"math"
)
// encoding is a base32 codec described by its alphabet: enc maps a 5-bit
// value to its character and dec maps a character byte back to its value,
// with 0xff in every slot that is not part of the alphabet.
type encoding struct {
	enc string
	dec [256]byte
}

// newEncoding builds the codec for the given alphabet, which is expected to
// be a 32-byte string.
func newEncoding(encoder string) *encoding {
	codec := &encoding{enc: encoder}
	for slot := range codec.dec {
		codec.dec[slot] = 0xff
	}
	for value, char := range []byte(encoder) {
		codec.dec[char] = byte(value)
	}
	return codec
}

// decode packs the characters of s, five bits each and most significant
// first, into a 64-bit word. s may be at most 12 characters long.
func (e *encoding) decode(s string) uint64 {
	var word uint64
	for _, char := range []byte(s) {
		word = word<<5 | uint64(e.dec[char])
	}
	return word
}

// encode renders the low 60 bits of x as a 12-character string, five bits per
// character, most significant group first.
func (e *encoding) encode(x uint64) string {
	var out [12]byte
	for pos := len(out) - 1; pos >= 0; pos-- {
		out[pos] = e.enc[x&0x1f]
		x >>= 5
	}
	return string(out[:])
}
// base32encoding is the package-level encoding built from the Geohash base32
// alphabet (digits and lowercase letters without a, i, l and o).
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")
// geoBoundingBox returns the region encoded by the given string geohash. Each
// character contributes five bits of precision.
func geoBoundingBox(hash string) geoBox {
	precision := uint(5 * len(hash))
	return geoBoundingBoxIntWithPrecision(base32encoding.decode(hash), precision)
}
// geoBox is a rectangle in latitude/longitude space, stored as the minimum
// and maximum value along each axis.
type geoBox struct {
	minLat, maxLat float64
	minLng, maxLng float64
}
// round returns a point inside the box, making an effort to use as few
// decimal digits as possible. On each axis the lower bound is rounded up to
// the next multiple of the largest power of ten that does not exceed the
// extent of the box along that axis.
func (b geoBox) round() (lat, lng float64) {
	latStep := maxDecimalPower(b.maxLat - b.minLat)
	lngStep := maxDecimalPower(b.maxLng - b.minLng)
	lat = math.Ceil(b.minLat/latStep) * latStep
	lng = math.Ceil(b.minLng/lngStep) * lngStep
	return lat, lng
}
// exp232 is 2^32, the scale factor between a 32-bit range encoding and the
// unit interval; precalculated for performance.
var exp232 = math.Exp2(32)
// errorWithPrecision returns the error range in latitude and longitude for an
// integer geohash with the given number of bits of precision. Latitude is
// assigned bits/2 bits (rounded down) and longitude the remainder; every bit
// halves the 180 (latitude) or 360 (longitude) span.
func errorWithPrecision(bits uint) (latErr, lngErr float64) {
	latBits := int(bits) / 2
	lngBits := int(bits) - latBits
	return math.Ldexp(180.0, -latBits), math.Ldexp(360.0, -lngBits)
}
// maxDecimalPower returns 10 raised to floor(log10(r)), a power of ten that
// does not exceed r. Any range of width r therefore contains a multiple of
// the result, which is what lets round pick minimal precision coordinates
// inside a box.
func maxDecimalPower(r float64) float64 {
	return math.Pow10(int(math.Floor(math.Log10(r))))
}
// encodeRange encodes the position of x within the range -r to +r as a
// 32-bit integer: -r maps to 0 and values just below +r map to the largest
// codes.
//
// The scaled position is clamped to [0, math.MaxUint32] before conversion.
// Without the clamp, x == r scales to exactly 2^32 (and x outside the range
// to negative or larger values), none of which fit in a uint32; Go leaves
// the result of such a float-to-integer conversion implementation-dependent.
// A NaN position encodes as 0.
func encodeRange(x, r float64) uint32 {
	// 2^32 as an untyped constant; exactly representable as a float64.
	const scale = 1 << 32

	v := (x + r) / (2 * r) * scale
	switch {
	case v >= scale:
		return math.MaxUint32
	case v > 0:
		return uint32(v)
	default:
		// Covers v <= 0 as well as NaN, for which every comparison is false.
		return 0
	}
}
// decodeRange maps the 32-bit range encoding X back to a value in the range
// -r to +r: X is first scaled down to a fraction of the unit interval, which
// is then stretched over the 2*r wide range starting at -r.
func decodeRange(X uint32, r float64) float64 {
	frac := float64(X) / exp232
	return 2*r*frac - r
}
// squash compacts the even bit levels of X into a 32-bit word. Odd bit levels
// of X are ignored and may take any value.
func squash(X uint64) uint32 {
	// After the odd levels are masked away, each pass folds neighbouring
	// groups together, doubling the run of contiguous payload bits
	// (1 -> 2 -> 4 -> 8 -> 16 -> 32).
	passes := [...]struct {
		shift uint
		mask  uint64
	}{
		{1, 0x3333333333333333},
		{2, 0x0f0f0f0f0f0f0f0f},
		{4, 0x00ff00ff00ff00ff},
		{8, 0x0000ffff0000ffff},
		{16, 0x00000000ffffffff},
	}

	X &= 0x5555555555555555
	for _, p := range passes {
		X = (X | X>>p.shift) & p.mask
	}
	return uint32(X)
}
// deinterleave splits X into two 32-bit words: the first holds the even bit
// levels of X, the second the odd bit levels.
func deinterleave(X uint64) (uint32, uint32) {
	even := squash(X)
	// Shifting by one moves the odd levels onto even positions.
	odd := squash(X >> 1)
	return even, odd
}
// geoBoundingBoxIntWithPrecision returns the region encoded by the integer
// geohash with the specified precision. The hash is left-aligned in the
// 64-bit word before its interleaved bits are split into a latitude and a
// longitude code; the decoded values become the lower corner of the box and
// the precision-dependent error is added to obtain the upper corner.
//
// bits is unsigned, so the shift count 64-bits wraps around for values above
// 64; callers are presumably expected to stay within 64 bits.
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox {
	latCode, lngCode := deinterleave(hash << (64 - bits))
	latErr, lngErr := errorWithPrecision(bits)

	var box geoBox
	box.minLat = decodeRange(latCode, 90)
	box.minLng = decodeRange(lngCode, 180)
	box.maxLat = box.minLat + latErr
	box.maxLng = box.minLng + lngErr
	return box
}
// ----------------------------------------------------------------------
// GeoHashDecode decodes the string geohash to a (lat, lng) point: the hash is
// expanded to its bounding box and a minimal precision point inside that box
// is returned.
func GeoHashDecode(hash string) (lat, lng float64) {
	return geoBoundingBox(hash).round()
}

@ -16,6 +16,7 @@ package geo
import ( import (
"reflect" "reflect"
"strconv"
"strings" "strings"
) )
@ -24,6 +25,8 @@ import (
// Container: // Container:
// slice length 2 (GeoJSON) // slice length 2 (GeoJSON)
// first element lon, second element lat // first element lon, second element lat
// string (coordinates separated by comma, or a geohash)
// first element lat, second element lon
// map[string]interface{} // map[string]interface{}
// exact keys lat and lon or lng // exact keys lat and lon or lng
// struct // struct
@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
var foundLon, foundLat bool var foundLon, foundLat bool
thingVal := reflect.ValueOf(thing) thingVal := reflect.ValueOf(thing)
if !thingVal.IsValid() {
return lon, lat, false
}
thingTyp := thingVal.Type() thingTyp := thingVal.Type()
// is it a slice // is it a slice
if thingVal.IsValid() && thingVal.Kind() == reflect.Slice { if thingVal.Kind() == reflect.Slice {
// must be length 2 // must be length 2
if thingVal.Len() == 2 { if thingVal.Len() == 2 {
first := thingVal.Index(0) first := thingVal.Index(0)
@ -55,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
} }
} }
// is it a string
if thingVal.Kind() == reflect.String {
geoStr := thingVal.Interface().(string)
if strings.Contains(geoStr, ",") {
// geo point with coordinates split by comma
points := strings.Split(geoStr, ",")
for i, point := range points {
// trim any leading or trailing white spaces
points[i] = strings.TrimSpace(point)
}
if len(points) == 2 {
var err error
lat, err = strconv.ParseFloat(points[0], 64)
if err == nil {
foundLat = true
}
lon, err = strconv.ParseFloat(points[1], 64)
if err == nil {
foundLon = true
}
}
} else {
// geohash
lat, lon = GeoHashDecode(geoStr)
foundLat = true
foundLon = true
}
}
// is it a map // is it a map
if l, ok := thing.(map[string]interface{}); ok { if l, ok := thing.(map[string]interface{}); ok {
if lval, ok := l["lon"]; ok { if lval, ok := l["lon"]; ok {
@ -68,7 +104,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
} }
// now try reflection on struct fields // now try reflection on struct fields
if thingVal.IsValid() && thingVal.Kind() == reflect.Struct { if thingVal.Kind() == reflect.Struct {
for i := 0; i < thingVal.NumField(); i++ { for i := 0; i < thingVal.NumField(); i++ {
fieldName := thingTyp.Field(i).Name fieldName := thingTyp.Field(i).Name
if strings.HasPrefix(strings.ToLower(fieldName), "lon") { if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
@ -113,6 +149,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
// extract numeric value (if possible) and returns a float64 // extract numeric value (if possible) and returns a float64
func extractNumericVal(v interface{}) (float64, bool) { func extractNumericVal(v interface{}) (float64, bool) {
val := reflect.ValueOf(v) val := reflect.ValueOf(v)
if !val.IsValid() {
return 0, false
}
typ := val.Type() typ := val.Type()
switch typ.Kind() { switch typ.Kind() {
case reflect.Float32, reflect.Float64: case reflect.Float32, reflect.Float64:

@ -21,6 +21,7 @@ import (
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/size"
) )
// A Batch groups together multiple Index and Delete // A Batch groups together multiple Index and Delete
@ -32,6 +33,9 @@ import (
type Batch struct { type Batch struct {
index Index index Index
internal *index.Batch internal *index.Batch
lastDocSize uint64
totalSize uint64
} }
// Index adds the specified index operation to the // Index adds the specified index operation to the
@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error {
return err return err
} }
b.internal.Update(doc) b.internal.Update(doc)
b.lastDocSize = uint64(doc.Size() +
len(id) + size.SizeOfString) // overhead from internal
b.totalSize += b.lastDocSize
return nil return nil
} }
func (b *Batch) LastDocSize() uint64 {
return b.lastDocSize
}
func (b *Batch) TotalDocsSize() uint64 {
return b.totalSize
}
// IndexAdvanced adds the specified index operation to the // IndexAdvanced adds the specified index operation to the
// batch which skips the mapping. NOTE: the bleve Index is not updated // batch which skips the mapping. NOTE: the bleve Index is not updated
// until the batch is executed. // until the batch is executed.
@ -102,6 +119,24 @@ func (b *Batch) Reset() {
b.internal.Reset() b.internal.Reset()
} }
func (b *Batch) Merge(o *Batch) {
if o != nil && o.internal != nil {
b.internal.Merge(o.internal)
if o.LastDocSize() > 0 {
b.lastDocSize = o.LastDocSize()
}
b.totalSize = uint64(b.internal.TotalDocSize())
}
}
func (b *Batch) SetPersistedCallback(f index.BatchCallback) {
b.internal.SetPersistedCallback(f)
}
func (b *Batch) PersistedCallback() index.BatchCallback {
return b.internal.PersistedCallback()
}
// An Index implements all the indexing and searching // An Index implements all the indexing and searching
// capabilities of bleve. An Index can be created // capabilities of bleve. An Index can be created
// using the New() and Open() methods. // using the New() and Open() methods.

@ -15,10 +15,20 @@
package index package index
import ( import (
"reflect"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeAnalysisResult int
func init() {
var ar AnalysisResult
reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size())
}
type IndexRow interface { type IndexRow interface {
KeySize() int KeySize() int
KeyTo([]byte) (int, error) KeyTo([]byte) (int, error)
@ -39,6 +49,15 @@ type AnalysisResult struct {
Length []int Length []int
} }
func (a *AnalysisResult) Size() int {
rv := reflectStaticSizeAnalysisResult
for _, analyzedI := range a.Analyzed {
rv += analyzedI.Size()
}
rv += len(a.Length) * size.SizeOfInt
return rv
}
type AnalysisWork struct { type AnalysisWork struct {
i Index i Index
d *document.Document d *document.Document

@ -18,11 +18,23 @@ import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeTermFieldDoc int
var reflectStaticSizeTermFieldVector int
func init() {
var tfd TermFieldDoc
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
var tfv TermFieldVector
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
}
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type") var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
type Index interface { type Index interface {
@ -68,6 +80,8 @@ type IndexReader interface {
Document(id string) (*document.Document, error) Document(id string) (*document.Document, error)
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
DocValueReader(fields []string) (DocValueReader, error)
Fields() ([]string, error) Fields() ([]string, error)
GetInternal(key []byte) ([]byte, error) GetInternal(key []byte) ([]byte, error)
@ -84,6 +98,29 @@ type IndexReader interface {
Close() error Close() error
} }
// The Regexp interface defines the subset of the regexp.Regexp API
// methods that are used by bleve indexes, allowing callers to pass in
// alternate implementations.
type Regexp interface {
FindStringIndex(s string) (loc []int)
LiteralPrefix() (prefix string, complete bool)
String() string
}
type IndexReaderRegexp interface {
FieldDictRegexp(field string, regex string) (FieldDict, error)
}
type IndexReaderFuzzy interface {
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
}
type IndexReaderOnly interface {
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
// FieldTerms contains the terms used by a document, keyed by field // FieldTerms contains the terms used by a document, keyed by field
type FieldTerms map[string][]string type FieldTerms map[string][]string
@ -115,6 +152,11 @@ type TermFieldVector struct {
End uint64 End uint64
} }
func (tfv *TermFieldVector) Size() int {
return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
}
// IndexInternalID is an opaque document identifier interal to the index impl // IndexInternalID is an opaque document identifier interal to the index impl
type IndexInternalID []byte type IndexInternalID []byte
@ -134,14 +176,27 @@ type TermFieldDoc struct {
Vectors []*TermFieldVector Vectors []*TermFieldVector
} }
func (tfd *TermFieldDoc) Size() int {
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
len(tfd.Term) + len(tfd.ID)
for _, entry := range tfd.Vectors {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
// Reset allows an already allocated TermFieldDoc to be reused // Reset allows an already allocated TermFieldDoc to be reused
func (tfd *TermFieldDoc) Reset() *TermFieldDoc { func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
// remember the []byte used for the ID // remember the []byte used for the ID
id := tfd.ID id := tfd.ID
vectors := tfd.Vectors
// idiom to copy over from empty TermFieldDoc (0 allocations) // idiom to copy over from empty TermFieldDoc (0 allocations)
*tfd = TermFieldDoc{} *tfd = TermFieldDoc{}
// reuse the []byte already allocated (and reset len to 0) // reuse the []byte already allocated (and reset len to 0)
tfd.ID = id[:0] tfd.ID = id[:0]
tfd.Vectors = vectors[:0]
return tfd return tfd
} }
@ -161,6 +216,8 @@ type TermFieldReader interface {
// Count returns the number of documents contains the term in this field. // Count returns the number of documents contains the term in this field.
Count() uint64 Count() uint64
Close() error Close() error
Size() int
} }
type DictEntry struct { type DictEntry struct {
@ -185,12 +242,18 @@ type DocIDReader interface {
// will start there instead. If ID is greater than or equal to the end of // will start there instead. If ID is greater than or equal to the end of
// the range, Next() call will return io.EOF. // the range, Next() call will return io.EOF.
Advance(ID IndexInternalID) (IndexInternalID, error) Advance(ID IndexInternalID) (IndexInternalID, error)
Size() int
Close() error Close() error
} }
type BatchCallback func(error)
type Batch struct { type Batch struct {
IndexOps map[string]*document.Document IndexOps map[string]*document.Document
InternalOps map[string][]byte InternalOps map[string][]byte
persistedCallback BatchCallback
} }
func NewBatch() *Batch { func NewBatch() *Batch {
@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) {
b.InternalOps[string(key)] = nil b.InternalOps[string(key)] = nil
} }
func (b *Batch) SetPersistedCallback(f BatchCallback) {
b.persistedCallback = f
}
func (b *Batch) PersistedCallback() BatchCallback {
return b.persistedCallback
}
func (b *Batch) String() string { func (b *Batch) String() string {
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps)) rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
for k, v := range b.IndexOps { for k, v := range b.IndexOps {
@ -238,4 +309,53 @@ func (b *Batch) String() string {
func (b *Batch) Reset() { func (b *Batch) Reset() {
b.IndexOps = make(map[string]*document.Document) b.IndexOps = make(map[string]*document.Document)
b.InternalOps = make(map[string][]byte) b.InternalOps = make(map[string][]byte)
b.persistedCallback = nil
}
func (b *Batch) Merge(o *Batch) {
for k, v := range o.IndexOps {
b.IndexOps[k] = v
}
for k, v := range o.InternalOps {
b.InternalOps[k] = v
}
}
func (b *Batch) TotalDocSize() int {
var s int
for k, v := range b.IndexOps {
if v != nil {
s += v.Size() + size.SizeOfString
}
s += len(k)
}
return s
}
// Optimizable represents an optional interface that is implementable by
// optimizable resources (e.g., TermFieldReaders, Searchers). These
// optimizable resources are provided the same OptimizableContext
// instance, so that they can coordinate via dynamic interface
// casting.
type Optimizable interface {
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
}
// Represents a result of optimization -- see the Finish() method.
type Optimized interface{}
type OptimizableContext interface {
// Once all the optimizable resources have been provided the same
// OptimizableContext instance, the optimization preparations are
// finished or completed via the Finish() method.
//
// Depending on the optimization being performed, the Finish()
// method might return a non-nil Optimized instance. For example,
// the Optimized instance might represent an optimized
// TermFieldReader instance.
Finish() (Optimized, error)
}
type DocValueReader interface {
VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
} }

@ -19,7 +19,9 @@ import (
"sync/atomic" "sync/atomic"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
) )
type segmentIntroduction struct { type segmentIntroduction struct {
@ -31,6 +33,12 @@ type segmentIntroduction struct {
applied chan error applied chan error
persisted chan error persisted chan error
persistedCallback index.BatchCallback
}
type persistIntroduction struct {
persisted map[uint64]segment.Segment
applied notificationChan
} }
type epochWatcher struct { type epochWatcher struct {
@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() {
var epochWatchers []*epochWatcher var epochWatchers []*epochWatcher
OUTER: OUTER:
for { for {
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1)
select { select {
case <-s.closeCh: case <-s.closeCh:
break OUTER break OUTER
@ -64,6 +74,9 @@ OUTER:
continue OUTER continue OUTER
} }
case persist := <-s.persists:
s.introducePersist(persist)
case revertTo := <-s.revertToSnapshots: case revertTo := <-s.revertToSnapshots:
err := s.revertToSnapshot(revertTo) err := s.revertToSnapshot(revertTo)
if err != nil { if err != nil {
@ -92,32 +105,38 @@ OUTER:
} }
func (s *Scorch) introduceSegment(next *segmentIntroduction) error { func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
// acquire lock atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1)
s.rootLock.Lock() defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1)
nsegs := len(s.root.segment) s.rootLock.RLock()
root := s.root
root.AddRef()
s.rootLock.RUnlock()
defer func() { _ = root.DecRef() }()
nsegs := len(root.segment)
// prepare new index snapshot // prepare new index snapshot
newSnapshot := &IndexSnapshot{ newSnapshot := &IndexSnapshot{
parent: s, parent: s,
segment: make([]*SegmentSnapshot, 0, nsegs+1), segment: make([]*SegmentSnapshot, 0, nsegs+1),
offsets: make([]uint64, 0, nsegs+1), offsets: make([]uint64, 0, nsegs+1),
internal: make(map[string][]byte, len(s.root.internal)), internal: make(map[string][]byte, len(root.internal)),
epoch: s.nextSnapshotEpoch,
refs: 1, refs: 1,
creator: "introduceSegment",
} }
s.nextSnapshotEpoch++
// iterate through current segments // iterate through current segments
var running uint64 var running uint64
for i := range s.root.segment { var docsToPersistCount, memSegments, fileSegments uint64
for i := range root.segment {
// see if optimistic work included this segment // see if optimistic work included this segment
delta, ok := next.obsoletes[s.root.segment[i].id] delta, ok := next.obsoletes[root.segment[i].id]
if !ok { if !ok {
var err error var err error
delta, err = s.root.segment[i].segment.DocNumbers(next.ids) delta, err = root.segment[i].segment.DocNumbers(next.ids)
if err != nil { if err != nil {
s.rootLock.Unlock()
next.applied <- fmt.Errorf("error computing doc numbers: %v", err) next.applied <- fmt.Errorf("error computing doc numbers: %v", err)
close(next.applied) close(next.applied)
_ = newSnapshot.DecRef() _ = newSnapshot.DecRef()
@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
} }
newss := &SegmentSnapshot{ newss := &SegmentSnapshot{
id: s.root.segment[i].id, id: root.segment[i].id,
segment: s.root.segment[i].segment, segment: root.segment[i].segment,
cachedDocs: s.root.segment[i].cachedDocs, cachedDocs: root.segment[i].cachedDocs,
creator: root.segment[i].creator,
} }
// apply new obsoletions // apply new obsoletions
if s.root.segment[i].deleted == nil { if root.segment[i].deleted == nil {
newss.deleted = delta newss.deleted = delta
} else { } else {
newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) newss.deleted = roaring.Or(root.segment[i].deleted, delta)
}
if newss.deleted.IsEmpty() {
newss.deleted = nil
} }
// check for live size before copying // check for live size before copying
if newss.LiveSize() > 0 { if newss.LiveSize() > 0 {
newSnapshot.segment = append(newSnapshot.segment, newss) newSnapshot.segment = append(newSnapshot.segment, newss)
s.root.segment[i].segment.AddRef() root.segment[i].segment.AddRef()
newSnapshot.offsets = append(newSnapshot.offsets, running) newSnapshot.offsets = append(newSnapshot.offsets, running)
running += s.root.segment[i].Count() running += newss.segment.Count()
}
if isMemorySegment(root.segment[i]) {
docsToPersistCount += root.segment[i].Count()
memSegments++
} else {
fileSegments++
} }
} }
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
// append new segment, if any, to end of the new index snapshot // append new segment, if any, to end of the new index snapshot
if next.data != nil { if next.data != nil {
newSegmentSnapshot := &SegmentSnapshot{ newSegmentSnapshot := &SegmentSnapshot{
id: next.id, id: next.id,
segment: next.data, // take ownership of next.data's ref-count segment: next.data, // take ownership of next.data's ref-count
cachedDocs: &cachedDocs{cache: nil}, cachedDocs: &cachedDocs{cache: nil},
creator: "introduceSegment",
} }
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
newSnapshot.offsets = append(newSnapshot.offsets, running) newSnapshot.offsets = append(newSnapshot.offsets, running)
// increment numItemsIntroduced which tracks the number of items // increment numItemsIntroduced which tracks the number of items
// queued for persistence. // queued for persistence.
atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count()) atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count())
atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1)
} }
// copy old values // copy old values
for key, oldVal := range s.root.internal { for key, oldVal := range root.internal {
newSnapshot.internal[key] = oldVal newSnapshot.internal[key] = oldVal
} }
// set new values and apply deletes // set new values and apply deletes
@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
delete(newSnapshot.internal, key) delete(newSnapshot.internal, key)
} }
} }
newSnapshot.updateSize()
s.rootLock.Lock()
if next.persisted != nil { if next.persisted != nil {
s.rootPersisted = append(s.rootPersisted, next.persisted) s.rootPersisted = append(s.rootPersisted, next.persisted)
} }
if next.persistedCallback != nil {
s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback)
}
// swap in new index snapshot // swap in new index snapshot
newSnapshot.epoch = s.nextSnapshotEpoch
s.nextSnapshotEpoch++
rootPrev := s.root rootPrev := s.root
s.root = newSnapshot s.root = newSnapshot
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
// release lock // release lock
s.rootLock.Unlock() s.rootLock.Unlock()
@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
return nil return nil
} }
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { func (s *Scorch) introducePersist(persist *persistIntroduction) {
// acquire lock atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1)
s.rootLock.Lock() s.rootLock.Lock()
root := s.root
root.AddRef()
nextSnapshotEpoch := s.nextSnapshotEpoch
s.nextSnapshotEpoch++
s.rootLock.Unlock()
// prepare new index snapshot defer func() { _ = root.DecRef() }()
currSize := len(s.root.segment)
newSize := currSize + 1 - len(nextMerge.old) newIndexSnapshot := &IndexSnapshot{
parent: s,
epoch: nextSnapshotEpoch,
segment: make([]*SegmentSnapshot, len(root.segment)),
offsets: make([]uint64, len(root.offsets)),
internal: make(map[string][]byte, len(root.internal)),
refs: 1,
creator: "introducePersist",
}
var docsToPersistCount, memSegments, fileSegments uint64
for i, segmentSnapshot := range root.segment {
// see if this segment has been replaced
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok {
newSegmentSnapshot := &SegmentSnapshot{
id: segmentSnapshot.id,
segment: replacement,
deleted: segmentSnapshot.deleted,
cachedDocs: segmentSnapshot.cachedDocs,
creator: "introducePersist",
}
newIndexSnapshot.segment[i] = newSegmentSnapshot
delete(persist.persisted, segmentSnapshot.id)
// update items persisted incase of a new segment snapshot
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count())
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
fileSegments++
} else {
newIndexSnapshot.segment[i] = root.segment[i]
newIndexSnapshot.segment[i].segment.AddRef()
if isMemorySegment(root.segment[i]) {
docsToPersistCount += root.segment[i].Count()
memSegments++
} else {
fileSegments++
}
}
newIndexSnapshot.offsets[i] = root.offsets[i]
}
for k, v := range root.internal {
newIndexSnapshot.internal[k] = v
}
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
newIndexSnapshot.updateSize()
s.rootLock.Lock()
rootPrev := s.root
s.root = newIndexSnapshot
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
// empty segments deletion close(persist.applied)
if nextMerge.new == nil {
newSize--
} }
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1)
s.rootLock.RLock()
root := s.root
root.AddRef()
s.rootLock.RUnlock()
defer func() { _ = root.DecRef() }()
newSnapshot := &IndexSnapshot{ newSnapshot := &IndexSnapshot{
parent: s, parent: s,
segment: make([]*SegmentSnapshot, 0, newSize), internal: root.internal,
offsets: make([]uint64, 0, newSize),
internal: s.root.internal,
epoch: s.nextSnapshotEpoch,
refs: 1, refs: 1,
creator: "introduceMerge",
} }
s.nextSnapshotEpoch++
// iterate through current segments // iterate through current segments
newSegmentDeleted := roaring.NewBitmap() newSegmentDeleted := roaring.NewBitmap()
var running uint64 var running, docsToPersistCount, memSegments, fileSegments uint64
for i := range s.root.segment { for i := range root.segment {
segmentID := s.root.segment[i].id segmentID := root.segment[i].id
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
// this segment is going away, see if anything else was deleted since we started the merge // this segment is going away, see if anything else was deleted since we started the merge
if segSnapAtMerge != nil && s.root.segment[i].deleted != nil { if segSnapAtMerge != nil && root.segment[i].deleted != nil {
// assume all these deletes are new // assume all these deletes are new
deletedSince := s.root.segment[i].deleted deletedSince := root.segment[i].deleted
// if we already knew about some of them, remove // if we already knew about some of them, remove
if segSnapAtMerge.deleted != nil { if segSnapAtMerge.deleted != nil {
deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted)
} }
deletedSinceItr := deletedSince.Iterator() deletedSinceItr := deletedSince.Iterator()
for deletedSinceItr.HasNext() { for deletedSinceItr.HasNext() {
@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
// segments left behind in old map after processing // segments left behind in old map after processing
// the root segments would be the obsolete segment set // the root segments would be the obsolete segment set
delete(nextMerge.old, segmentID) delete(nextMerge.old, segmentID)
} else if root.segment[i].LiveSize() > 0 {
} else if s.root.segment[i].LiveSize() > 0 {
// this segment is staying // this segment is staying
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
id: s.root.segment[i].id, id: root.segment[i].id,
segment: s.root.segment[i].segment, segment: root.segment[i].segment,
deleted: s.root.segment[i].deleted, deleted: root.segment[i].deleted,
cachedDocs: s.root.segment[i].cachedDocs, cachedDocs: root.segment[i].cachedDocs,
creator: root.segment[i].creator,
}) })
s.root.segment[i].segment.AddRef() root.segment[i].segment.AddRef()
newSnapshot.offsets = append(newSnapshot.offsets, running) newSnapshot.offsets = append(newSnapshot.offsets, running)
running += s.root.segment[i].Count() running += root.segment[i].segment.Count()
if isMemorySegment(root.segment[i]) {
docsToPersistCount += root.segment[i].Count()
memSegments++
} else {
fileSegments++
}
} }
} }
@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
} }
} }
} }
// In case where all the docs in the newly merged segment getting // In case where all the docs in the newly merged segment getting
// deleted by the time we reach here, can skip the introduction. // deleted by the time we reach here, can skip the introduction.
if nextMerge.new != nil && if nextMerge.new != nil &&
@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
deleted: newSegmentDeleted, deleted: newSegmentDeleted,
cachedDocs: &cachedDocs{cache: nil}, cachedDocs: &cachedDocs{cache: nil},
creator: "introduceMerge",
}) })
newSnapshot.offsets = append(newSnapshot.offsets, running) newSnapshot.offsets = append(newSnapshot.offsets, running)
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
switch nextMerge.new.(type) {
case *zap.SegmentBase:
docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality()
memSegments++
case *zap.Segment:
fileSegments++
}
} }
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
// swap in new segment newSnapshot.updateSize()
s.rootLock.Lock()
// swap in new index snapshot
newSnapshot.epoch = s.nextSnapshotEpoch
s.nextSnapshotEpoch++
rootPrev := s.root rootPrev := s.root
s.root = newSnapshot s.root = newSnapshot
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
// release lock // release lock
s.rootLock.Unlock() s.rootLock.Unlock()
@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
} }
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1)
if revertTo.snapshot == nil { if revertTo.snapshot == nil {
err := fmt.Errorf("Cannot revert to a nil snapshot") err := fmt.Errorf("Cannot revert to a nil snapshot")
revertTo.applied <- err revertTo.applied <- err
@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
internal: revertTo.snapshot.internal, internal: revertTo.snapshot.internal,
epoch: s.nextSnapshotEpoch, epoch: s.nextSnapshotEpoch,
refs: 1, refs: 1,
creator: "revertToSnapshot",
} }
s.nextSnapshotEpoch++ s.nextSnapshotEpoch++
var docsToPersistCount, memSegments, fileSegments uint64
// iterate through segments // iterate through segments
for i, segmentSnapshot := range revertTo.snapshot.segment { for i, segmentSnapshot := range revertTo.snapshot.segment {
newSnapshot.segment[i] = &SegmentSnapshot{ newSnapshot.segment[i] = &SegmentSnapshot{
@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
segment: segmentSnapshot.segment, segment: segmentSnapshot.segment,
deleted: segmentSnapshot.deleted, deleted: segmentSnapshot.deleted,
cachedDocs: segmentSnapshot.cachedDocs, cachedDocs: segmentSnapshot.cachedDocs,
creator: segmentSnapshot.creator,
} }
newSnapshot.segment[i].segment.AddRef() newSnapshot.segment[i].segment.AddRef()
// remove segment from ineligibleForRemoval map // remove segment from ineligibleForRemoval map
filename := zapFileName(segmentSnapshot.id) filename := zapFileName(segmentSnapshot.id)
delete(s.ineligibleForRemoval, filename) delete(s.ineligibleForRemoval, filename)
if isMemorySegment(segmentSnapshot) {
docsToPersistCount += segmentSnapshot.Count()
memSegments++
} else {
fileSegments++
} }
}
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
if revertTo.persisted != nil { if revertTo.persisted != nil {
s.rootPersisted = append(s.rootPersisted, revertTo.persisted) s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
} }
newSnapshot.updateSize()
// swap in new snapshot // swap in new snapshot
rootPrev := s.root rootPrev := s.root
s.root = newSnapshot s.root = newSnapshot
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
// release lock // release lock
s.rootLock.Unlock() s.rootLock.Unlock()
@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
return nil return nil
} }
// isMemorySegment reports whether the segment wrapped by the given snapshot
// has the dynamic type *zap.SegmentBase. Any other segment type (or a nil
// segment) yields false.
func isMemorySegment(s *SegmentSnapshot) bool {
	_, inMemory := s.segment.(*zap.SegmentBase)
	return inMemory
}

@ -15,9 +15,7 @@
package scorch package scorch
import ( import (
"bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
"sync/atomic" "sync/atomic"
@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() {
OUTER: OUTER:
for { for {
atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1)
select { select {
case <-s.closeCh: case <-s.closeCh:
break OUTER break OUTER
default: default:
// check to see if there is a new snapshot to persist // check to see if there is a new snapshot to persist
s.rootLock.RLock() s.rootLock.Lock()
ourSnapshot := s.root ourSnapshot := s.root
ourSnapshot.AddRef() ourSnapshot.AddRef()
s.rootLock.RUnlock() atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch)
s.rootLock.Unlock()
if ourSnapshot.epoch != lastEpochMergePlanned { if ourSnapshot.epoch != lastEpochMergePlanned {
startTime := time.Now() startTime := time.Now()
@ -57,12 +59,21 @@ OUTER:
// lets get started // lets get started
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
if err != nil { if err != nil {
atomic.StoreUint64(&s.iStats.mergeEpoch, 0)
if err == segment.ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
}
s.fireAsyncError(fmt.Errorf("merging err: %v", err)) s.fireAsyncError(fmt.Errorf("merging err: %v", err))
_ = ourSnapshot.DecRef() _ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1)
continue OUTER continue OUTER
} }
lastEpochMergePlanned = ourSnapshot.epoch lastEpochMergePlanned = ourSnapshot.epoch
atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch)
s.fireEvent(EventKindMergerProgress, time.Since(startTime)) s.fireEvent(EventKindMergerProgress, time.Since(startTime))
} }
_ = ourSnapshot.DecRef() _ = ourSnapshot.DecRef()
@ -88,7 +99,10 @@ OUTER:
case <-ew.notifyCh: case <-ew.notifyCh:
} }
} }
atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1)
} }
s.asyncTasks.Done() s.asyncTasks.Done()
} }
@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions,
if err != nil { if err != nil {
return &mergePlannerOptions, err return &mergePlannerOptions, err
} }
err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions)
if err != nil {
return nil, err
}
} }
return &mergePlannerOptions, nil return &mergePlannerOptions, nil
} }
@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
} }
} }
atomic.AddUint64(&s.stats.TotFileMergePlan, 1)
// give this list to the planner // give this list to the planner
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
if err != nil { if err != nil {
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1)
return fmt.Errorf("merge planning err: %v", err) return fmt.Errorf("merge planning err: %v", err)
} }
if resultMergePlan == nil { if resultMergePlan == nil {
// nothing to do // nothing to do
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
return nil return nil
} }
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
// process tasks in serial for now // process tasks in serial for now
var notifications []chan *IndexSnapshot var notifications []chan *IndexSnapshot
for _, task := range resultMergePlan.Tasks { for _, task := range resultMergePlan.Tasks {
if len(task.Segments) == 0 { if len(task.Segments) == 0 {
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
continue continue
} }
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments)))
oldMap := make(map[uint64]*SegmentSnapshot) oldMap := make(map[uint64]*SegmentSnapshot)
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
for _, planSegment := range task.Segments { for _, planSegment := range task.Segments {
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
oldMap[segSnapshot.id] = segSnapshot oldMap[segSnapshot.id] = segSnapshot
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
if segSnapshot.LiveSize() == 0 { if segSnapshot.LiveSize() == 0 {
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1)
oldMap[segSnapshot.id] = nil oldMap[segSnapshot.id] = nil
} else { } else {
segmentsToMerge = append(segmentsToMerge, zapSeg) segmentsToMerge = append(segmentsToMerge, zapSeg)
@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
} }
var oldNewDocNums map[uint64][]uint64 var oldNewDocNums map[uint64][]uint64
var segment segment.Segment var seg segment.Segment
if len(segmentsToMerge) > 0 { if len(segmentsToMerge) > 0 {
filename := zapFileName(newSegmentID) filename := zapFileName(newSegmentID)
s.markIneligibleForRemoval(filename) s.markIneligibleForRemoval(filename)
path := s.path + string(os.PathSeparator) + filename path := s.path + string(os.PathSeparator) + filename
newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024)
fileMergeZapStartTime := time.Now()
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path,
DefaultChunkFactor, s.closeCh, s)
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime)
if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime {
atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime)
}
if err != nil { if err != nil {
s.unmarkIneligibleForRemoval(filename) s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
if err == segment.ErrClosed {
return err
}
return fmt.Errorf("merging failed: %v", err) return fmt.Errorf("merging failed: %v", err)
} }
segment, err = zap.Open(path)
seg, err = zap.Open(path)
if err != nil { if err != nil {
s.unmarkIneligibleForRemoval(filename) s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
return err return err
} }
oldNewDocNums = make(map[uint64][]uint64) oldNewDocNums = make(map[uint64][]uint64)
for i, segNewDocNums := range newDocNums { for i, segNewDocNums := range newDocNums {
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
} }
atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge)))
} }
sm := &segmentMerge{ sm := &segmentMerge{
id: newSegmentID, id: newSegmentID,
old: oldMap, old: oldMap,
oldNewDocNums: oldNewDocNums, oldNewDocNums: oldNewDocNums,
new: segment, new: seg,
notify: make(chan *IndexSnapshot, 1), notify: make(chan *IndexSnapshot, 1),
} }
notifications = append(notifications, sm.notify) notifications = append(notifications, sm.notify)
@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
// give it to the introducer // give it to the introducer
select { select {
case <-s.closeCh: case <-s.closeCh:
_ = segment.Close() _ = seg.Close()
return nil return segment.ErrClosed
case s.merges <- sm: case s.merges <- sm:
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1)
} }
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1)
} }
for _, notification := range notifications { for _, notification := range notifications {
select { select {
case <-s.closeCh: case <-s.closeCh:
return nil atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1)
return segment.ErrClosed
case newSnapshot := <-notification: case newSnapshot := <-notification:
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
if newSnapshot != nil { if newSnapshot != nil {
_ = newSnapshot.DecRef() _ = newSnapshot.DecRef()
} }
} }
} }
return nil return nil
} }
@ -219,44 +279,48 @@ type segmentMerge struct {
// into the root // into the root
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { chunkFactor uint32) (*IndexSnapshot, uint64, error) {
var br bytes.Buffer atomic.AddUint64(&s.stats.TotMemMergeBeg, 1)
cr := zap.NewCountHashWriter(&br)
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, memMergeZapStartTime := time.Now()
docValueOffset, dictLocs, fieldsInv, fieldsMap, err :=
zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr)
if err != nil {
return 0, nil, 0, err
}
sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1)
fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset,
docValueOffset, dictLocs)
if err != nil {
return 0, nil, 0, err
}
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
filename := zapFileName(newSegmentID) filename := zapFileName(newSegmentID)
path := s.path + string(os.PathSeparator) + filename path := s.path + string(os.PathSeparator) + filename
err = zap.PersistSegmentBase(sb, path)
newDocNums, _, err :=
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s)
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)
memMergeZapTime := uint64(time.Since(memMergeZapStartTime))
atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime)
if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime {
atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime)
}
if err != nil { if err != nil {
return 0, nil, 0, err atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return nil, 0, err
} }
segment, err := zap.Open(path) seg, err := zap.Open(path)
if err != nil { if err != nil {
return 0, nil, 0, err atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return nil, 0, err
} }
// update persisted stats
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count())
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
sm := &segmentMerge{ sm := &segmentMerge{
id: newSegmentID, id: newSegmentID,
old: make(map[uint64]*SegmentSnapshot), old: make(map[uint64]*SegmentSnapshot),
oldNewDocNums: make(map[uint64][]uint64), oldNewDocNums: make(map[uint64][]uint64),
new: segment, new: seg,
notify: make(chan *IndexSnapshot, 1), notify: make(chan *IndexSnapshot, 1),
} }
@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
select { // send to introducer select { // send to introducer
case <-s.closeCh: case <-s.closeCh:
_ = segment.DecRef() _ = seg.DecRef()
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? return nil, 0, segment.ErrClosed
case s.merges <- sm: case s.merges <- sm:
} }
select { // wait for introduction to complete select { // wait for introduction to complete
case <-s.closeCh: case <-s.closeCh:
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? return nil, 0, segment.ErrClosed
case newSnapshot := <-sm.notify: case newSnapshot := <-sm.notify:
return numDocs, newSnapshot, newSegmentID, nil atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs)))
atomic.AddUint64(&s.stats.TotMemMergeDone, 1)
return newSnapshot, newSegmentID, nil
} }
} }
// ReportBytesWritten atomically adds bytesWritten to the
// TotFileMergeWrittenBytes stat.
// NOTE(review): s is handed to zap.Merge and zap.MergeSegmentBases as their
// trailing argument in this file, presumably so the merge code can call this
// method to report its output size -- confirm against the zap package.
func (s *Scorch) ReportBytesWritten(bytesWritten uint64) {
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten)
}

@ -18,6 +18,7 @@
package mergeplan package mergeplan
import ( import (
"errors"
"fmt" "fmt"
"math" "math"
"sort" "sort"
@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 {
return o.FloorSegmentSize return o.FloorSegmentSize
} }
// Suggested default options. // MaxSegmentSizeLimit represents the maximum size of a segment,
// this limit comes from the 1-hit optimisation, whose encoding is limited to 31 bits (uint31).
const MaxSegmentSizeLimit = 1<<31 - 1
// ErrMaxSegmentSizeTooLarge is returned when the size of the segment
// exceeds the MaxSegmentSizeLimit
var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit")
// DefaultMergePlanOptions suggests the default options.
var DefaultMergePlanOptions = MergePlanOptions{ var DefaultMergePlanOptions = MergePlanOptions{
MaxSegmentsPerTier: 10, MaxSegmentsPerTier: 10,
MaxSegmentSize: 5000000, MaxSegmentSize: 5000000,
@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
if len(roster) > 0 { if len(roster) > 0 {
rosterScore := scoreSegments(roster, o) rosterScore := scoreSegments(roster, o)
if len(bestRoster) <= 0 || rosterScore < bestRosterScore { if len(bestRoster) == 0 || rosterScore < bestRosterScore {
bestRoster = roster bestRoster = roster
bestRosterScore = rosterScore bestRosterScore = rosterScore
} }
} }
} }
if len(bestRoster) <= 0 { if len(bestRoster) == 0 {
return rv, nil return rv, nil
} }
@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan)
return strings.Join(rv, "\n") return strings.Join(rv, "\n")
} }
// ValidateMergePlannerOptions checks the given merge planner options. It
// returns ErrMaxSegmentSizeTooLarge when options.MaxSegmentSize is greater
// than MaxSegmentSizeLimit, and nil otherwise.
func ValidateMergePlannerOptions(options *MergePlanOptions) error {
	var verdict error
	if options.MaxSegmentSize > MaxSegmentSizeLimit {
		verdict = ErrMaxSegmentSizeTooLarge
	}
	return verdict
}

@ -0,0 +1,420 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
)
// OptimizeConjunction, OptimizeConjunctionUnadorned and
// OptimizeDisjunctionUnadorned gate the optimizations dispatched by
// IndexSnapshotTermFieldReader.Optimize: a given optimization kind is only
// attempted while its flag is true.
var OptimizeConjunction = true
var OptimizeConjunctionUnadorned = true
var OptimizeDisjunctionUnadorned = true
// Optimize dispatches to the optimization routine that matches kind, provided
// the corresponding package-level switch (OptimizeConjunction,
// OptimizeConjunctionUnadorned or OptimizeDisjunctionUnadorned) is enabled.
// For an unrecognized or disabled kind, the incoming octx is handed back
// untouched together with a nil error.
func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	switch {
	case OptimizeConjunction && kind == "conjunction":
		return s.optimizeConjunction(octx)
	case OptimizeConjunctionUnadorned && kind == "conjunction:unadorned":
		return s.optimizeConjunctionUnadorned(octx)
	case OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned":
		return s.optimizeDisjunctionUnadorned(octx)
	default:
		return octx, nil
	}
}
// OptimizeDisjunctionUnadornedMinChildCardinality is the threshold consulted
// by OptimizeTFRDisjunctionUnadorned.Finish: if, for any segment, the largest
// child bitmap cardinality is below this value, the unadorned disjunction
// optimization is skipped.
var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
// ----------------------------------------------------------------
// optimizeConjunction registers this term field reader with a conjunction
// optimization context. A nil octx starts a fresh OptimizeTFRConjunction bound
// to the reader's snapshot; an octx of any other concrete type is returned
// unchanged. A reader whose snapshot differs from the context's snapshot is
// rejected with an error.
func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	var conj *OptimizeTFRConjunction
	if octx == nil {
		conj = &OptimizeTFRConjunction{snapshot: s.snapshot}
	} else {
		asserted, matches := octx.(*OptimizeTFRConjunction)
		if !matches {
			// Some other kind of context: leave it alone.
			return octx, nil
		}
		conj = asserted
	}

	if conj.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
	}

	conj.tfrs = append(conj.tfrs, s)
	return conj, nil
}
// OptimizeTFRConjunction is the optimization context built up by
// IndexSnapshotTermFieldReader.optimizeConjunction: it accumulates the term
// field readers that take part in a conjunction over a single snapshot.
type OptimizeTFRConjunction struct {
// snapshot is the index snapshot every collected reader must belong to.
snapshot *IndexSnapshot
// tfrs holds the readers registered so far, in registration order.
tfrs []*IndexSnapshotTermFieldReader
}
// Finish applies the conjunction optimization in place. For every segment of
// the snapshot it AND's together the actual bitmaps of the collected readers'
// zap postings iterators and then points each of those iterators at the
// shared, AND'ed bitmap. A segment is left untouched when either of the first
// two readers has no zap postings iterator or no actual bitmap there; later
// readers lacking one are simply left out of the AND.
//
// It always returns nil, nil: the readers themselves are modified and no
// replacement is produced.
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	first, second, others := o.tfrs[0], o.tfrs[1], o.tfrs[2:]

	for segIdx := range o.snapshot.segment {
		lhs, isZap := first.iterators[segIdx].(*zap.PostingsIterator)
		if !isZap || lhs.ActualBM == nil {
			continue
		}

		rhs, isZap := second.iterators[segIdx].(*zap.PostingsIterator)
		if !isZap || rhs.ActualBM == nil {
			continue
		}

		shared := roaring.And(lhs.ActualBM, rhs.ActualBM)
		for _, reader := range others {
			if pi, isZap := reader.iterators[segIdx].(*zap.PostingsIterator); isZap && pi.ActualBM != nil {
				shared.And(pi.ActualBM)
			}
		}

		// Every participating postings iterator now shares the same
		// AND'ed bitmap. The regular conjunction searcher machinery is
		// still used afterwards, just over a smaller underlying bitmap.
		for _, reader := range o.tfrs {
			pi, isZap := reader.iterators[segIdx].(*zap.PostingsIterator)
			if !isZap || pi.ActualBM == nil {
				continue
			}
			pi.ActualBM = shared
			pi.Actual = shared.Iterator()
		}
	}

	return nil, nil
}
// ----------------------------------------------------------------
// optimizeConjunctionUnadorned registers this term field reader with an
// "unadorned" conjunction optimization context. The unadorned variant fits
// when only the internal-id's are needed, i.e. no additional or subsidiary
// information such as freq-norm's and term-vectors is required.
//
// A nil octx starts a fresh OptimizeTFRConjunctionUnadorned bound to the
// reader's snapshot. An octx of any other concrete type yields nil, nil. A
// reader whose snapshot differs from the context's snapshot is rejected with
// an error.
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	var conj *OptimizeTFRConjunctionUnadorned
	if octx == nil {
		conj = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
	} else {
		asserted, matches := octx.(*OptimizeTFRConjunctionUnadorned)
		if !matches {
			return nil, nil
		}
		conj = asserted
	}

	if conj.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
	}

	conj.tfrs = append(conj.tfrs, s)
	return conj, nil
}
// OptimizeTFRConjunctionUnadorned is the optimization context built up by
// IndexSnapshotTermFieldReader.optimizeConjunctionUnadorned: it accumulates
// the term field readers that take part in an unadorned conjunction over a
// single snapshot.
type OptimizeTFRConjunctionUnadorned struct {
// snapshot is the index snapshot every collected reader must belong to.
snapshot *IndexSnapshot
// tfrs holds the readers registered so far, in registration order.
tfrs []*IndexSnapshotTermFieldReader
}
// OptimizeTFRConjunctionUnadornedTerm and OptimizeTFRConjunctionUnadornedField
// are the artificial term and field assigned to the term field reader that
// OptimizeTFRConjunctionUnadorned.Finish produces, since that reader can
// represent multiple terms and fields.
var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
var OptimizeTFRConjunctionUnadornedField = "*"
// Finish of an unadorned conjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps AND'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
//
// It returns nil, nil when fewer than two readers were collected, when
// any reader has an iterator that is neither an empty postings iterator
// nor a zap postings iterator, or when building a result iterator fails.
// Otherwise it returns the new termFieldReader, holding one postings
// iterator per segment of the snapshot.
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
if len(o.tfrs) <= 1 {
// Fewer than two readers: there is nothing to AND together.
return nil, nil
}
// We use an artificial term and field because the optimized
// termFieldReader can represent multiple terms and fields.
oTFR := &IndexSnapshotTermFieldReader{
term: OptimizeTFRConjunctionUnadornedTerm,
field: OptimizeTFRConjunctionUnadornedField,
snapshot: o.snapshot,
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
segmentOffset: 0,
includeFreq: false,
includeNorm: false,
includeTermVectors: false,
}
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
OUTER:
for i := range o.snapshot.segment {
// Reset the per-segment state; the slice is truncated, not reallocated.
actualBMs = actualBMs[:0]
// docNum1HitLast is the docNum shared by the 1-hit iterators seen so
// far in this segment; docNum1HitLastOk records whether any was seen.
var docNum1HitLast uint64
var docNum1HitLastOk bool
for _, tfr := range o.tfrs {
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
// An empty postings iterator means the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
if !ok {
// We optimize zap postings iterators only.
return nil, nil
}
// If the postings iterator is "1-hit" optimized, then we
// can perform several optimizations up-front here.
docNum1Hit, ok := itr.DocNum1Hit()
if ok {
if docNum1Hit == zap.DocNum1HitFinished {
// An empty docNum here means the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
// The docNum1Hit doesn't match the previous
// docNum1HitLast, so the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
docNum1HitLast = docNum1Hit
docNum1HitLastOk = true
// 1-hit iterators contribute no bitmap; move on to the next reader.
continue
}
if itr.ActualBM == nil {
// An empty actual bitmap means the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
// Collect the actual bitmap for more processing later.
actualBMs = append(actualBMs, itr.ActualBM)
}
if docNum1HitLastOk {
// We reach here if all the 1-hit optimized posting
// iterators had the same 1-hit docNum, so we can check if
// our collected actual bitmaps also have that docNum.
for _, bm := range actualBMs {
if !bm.Contains(uint32(docNum1HitLast)) {
// The docNum1Hit isn't in one of our actual
// bitmaps, so the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
}
// The actual bitmaps and docNum1Hits all contain or have
// the same 1-hit docNum, so that's our AND'ed result.
oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
docNum1HitLast, zap.NormBits1Hit, false, false)
if err != nil {
// The error is not propagated; nil, nil is returned instead.
// NOTE(review): presumably so the caller falls back to the
// unoptimized path -- confirm against the caller.
return nil, nil
}
continue OUTER
}
if len(actualBMs) == 0 {
// If we've collected no actual bitmaps at this point,
// then the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
if len(actualBMs) == 1 {
// If we've only 1 actual bitmap, then that's our result.
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
actualBMs[0], false, false)
if err != nil {
// As above, the error is swallowed and nil, nil is returned.
return nil, nil
}
continue OUTER
}
// Else, AND together our collected bitmaps as our result.
// roaring.And yields a new bitmap, which the in-place And calls
// below then narrow further.
bm := roaring.And(actualBMs[0], actualBMs[1])
for _, actualBM := range actualBMs[2:] {
bm.And(actualBM)
}
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
bm, false, false)
if err != nil {
// As above, the error is swallowed and nil, nil is returned.
return nil, nil
}
}
return oTFR, nil
}
// ----------------------------------------------------------------
// optimizeDisjunctionUnadorned registers this term field reader with an
// "unadorned" disjunction optimization context. The unadorned variant fits
// when only the internal-id's are needed, i.e. no additional or subsidiary
// information such as freq-norm's and term-vectors is required.
//
// A nil octx starts a fresh OptimizeTFRDisjunctionUnadorned bound to the
// reader's snapshot. An octx of any other concrete type yields nil, nil. A
// reader whose snapshot differs from the context's snapshot is rejected with
// an error.
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
	octx index.OptimizableContext) (index.OptimizableContext, error) {
	var disj *OptimizeTFRDisjunctionUnadorned
	if octx == nil {
		disj = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
	} else {
		asserted, matches := octx.(*OptimizeTFRDisjunctionUnadorned)
		if !matches {
			return nil, nil
		}
		disj = asserted
	}

	if disj.snapshot != s.snapshot {
		return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
	}

	disj.tfrs = append(disj.tfrs, s)
	return disj, nil
}
// OptimizeTFRDisjunctionUnadorned is the optimization context built up by
// IndexSnapshotTermFieldReader.optimizeDisjunctionUnadorned: it accumulates
// the term field readers that take part in an unadorned disjunction over a
// single snapshot.
type OptimizeTFRDisjunctionUnadorned struct {
// snapshot is the index snapshot every collected reader must belong to.
snapshot *IndexSnapshot
// tfrs holds the readers registered so far, in registration order.
tfrs []*IndexSnapshotTermFieldReader
}
// OptimizeTFRDisjunctionUnadornedTerm and OptimizeTFRDisjunctionUnadornedField
// are the artificial term and field assigned to the term field reader that
// OptimizeTFRDisjunctionUnadorned.Finish produces, since that reader can
// represent multiple terms and fields.
var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
var OptimizeTFRDisjunctionUnadornedField = "*"
// Finish completes an unadorned disjunction optimization by building a
// termFieldReader whose per-segment "actual" bitmap is the OR of the
// constituent readers' bitmaps plus their 1-hit docNums. The resulting
// termFieldReader cannot supply freq-norm or termVector information.
//
// It returns nil, nil when fewer than two readers were collected, when any
// reader's iterator is not a zap postings iterator, when the cardinality
// heuristic rejects a segment, or when building a result iterator fails.
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
	if len(o.tfrs) <= 1 {
		return nil, nil
	}

	// First pass: heuristic check only, nothing is built yet.
	for segIdx := range o.snapshot.segment {
		var largest uint64

		for _, reader := range o.tfrs {
			pi, isZap := reader.iterators[segIdx].(*zap.PostingsIterator)
			if !isZap {
				return nil, nil
			}
			if pi.ActualBM == nil {
				continue
			}
			if card := pi.ActualBM.GetCardinality(); largest < card {
				largest = card
			}
		}

		// Skip the optimization if all the constituent bitmaps of this
		// segment are too small: the processing & resource overhead of
		// creating the OR'ed bitmap would outweigh the benefit.
		if largest < OptimizeDisjunctionUnadornedMinChildCardinality {
			return nil, nil
		}
	}

	// The optimized termFieldReader can stand for multiple terms and
	// fields, hence the artificial term and field.
	result := &IndexSnapshotTermFieldReader{
		term:               OptimizeTFRDisjunctionUnadornedTerm,
		field:              OptimizeTFRDisjunctionUnadornedField,
		snapshot:           o.snapshot,
		iterators:          make([]segment.PostingsIterator, len(o.snapshot.segment)),
		segmentOffset:      0,
		includeFreq:        false,
		includeNorm:        false,
		includeTermVectors: false,
	}

	var (
		oneHitDocNums []uint32          // docNum's gathered from 1-hit posting lists
		bitmaps       []*roaring.Bitmap // bitmaps gathered from regular posting lists
	)

	// Second pass: build one OR'ed postings iterator per segment.
	for segIdx := range o.snapshot.segment {
		oneHitDocNums = oneHitDocNums[:0]
		bitmaps = bitmaps[:0]

		for _, reader := range o.tfrs {
			pi, isZap := reader.iterators[segIdx].(*zap.PostingsIterator)
			if !isZap {
				return nil, nil
			}

			if docNum, oneHit := pi.DocNum1Hit(); oneHit {
				oneHitDocNums = append(oneHitDocNums, uint32(docNum))
			} else if pi.ActualBM != nil {
				bitmaps = append(bitmaps, pi.ActualBM)
			}
		}

		var merged *roaring.Bitmap
		switch len(bitmaps) {
		case 0:
			// Nothing to OR; an empty bitmap is created below.
		case 1:
			merged = bitmaps[0].Clone()
		case 2:
			merged = roaring.Or(bitmaps[0], bitmaps[1])
		default:
			merged = roaring.HeapOr(bitmaps...)
		}
		if merged == nil {
			merged = roaring.New()
		}

		merged.AddMany(oneHitDocNums)

		result.iterators[segIdx], err = zap.PostingsIteratorFromBitmap(merged, false, false)
		if err != nil {
			return nil, nil
		}
	}

	return result, nil
}

@ -16,9 +16,12 @@ package scorch
import ( import (
"bytes" "bytes"
"encoding/binary"
"encoding/json"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"log" "log"
"math"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
@ -27,23 +30,57 @@ import (
"time" "time"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/boltdb/bolt" bolt "github.com/etcd-io/bbolt"
) )
var DefaultChunkFactor uint32 = 1024 var DefaultChunkFactor uint32 = 1024
// Arbitrary number, need to make it configurable. // DefaultPersisterNapTimeMSec is kept to zero as this helps in direct
// Lower values like 10/making persister really slow // persistence of segments with the default safe batch option.
// doesn't work well as it is creating more files to // If the default safe batch option results in high number of
// persist for in next persist iteration and spikes the # FDs. // files on disk, then users may initialise this configuration parameter
// Ideal value should let persister also proceed at // with higher values so that the persister will nap a bit within it's
// an optimum pace so that the merger can skip // work loop to favour better in-memory merging of segments to result
// many intermediate snapshots. // in fewer segment files on disk. But that may come with an indexing
// This needs to be based on empirical data. // performance overhead.
// TODO - may need to revisit this approach/value. // Unsafe batch users are advised to override this to higher value
var epochDistance = uint64(5) // for better performance especially with high data density.
var DefaultPersisterNapTimeMSec int = 0 // ms
// DefaultPersisterNapUnderNumFiles helps control the pace of the
// persister. When the merger is progressing slowly under heavy file
// merging, it is better to slow the persister down so that the merger can
// catch up to within the range defined by this parameter.
// Fewer files on disk (as per the merge plan) keep file handle usage
// under the limit and result in a faster disk merger and a healthier index.
// It has been observed that such a loosely synchronised
// introducer-persister-merger trio gives better overall performance.
var DefaultPersisterNapUnderNumFiles int = 1000
// DefaultMemoryPressurePauseThreshold is set to math.MaxUint64, the largest
// uint64 value.
// NOTE(review): presumably the default for
// persisterOptions.MemoryPressurePauseThreshold, whose documentation describes
// a very high default that always favours merging memory segments -- confirm
// in parsePersisterOptions.
var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64
// persisterOptions groups the tunables of the persister. persisterLoop
// obtains them from s.parsePersisterOptions and hands them to
// pausePersisterForMergerCatchUp and persistSnapshot.
type persisterOptions struct {
// PersisterNapTimeMSec controls the wait/delay injected into the
// persistence work loop to improve the chances of healthier and
// heavier in-memory merging.
PersisterNapTimeMSec int
// If PersisterNapTimeMSec > 0 and the number of files is less than
// PersisterNapUnderNumFiles, then the persister will sleep for
// PersisterNapTimeMSec to improve the chances of healthier and
// heavier in-memory merging.
PersisterNapUnderNumFiles int
// MemoryPressurePauseThreshold gives the persister more leeway to
// prudently perform the memory merge of segments in a memory
// pressure situation. The config value is an upper threshold for
// the number of paused application threads. The default value is
// a very high number, so as to always favour the merging of memory
// segments.
MemoryPressurePauseThreshold uint64
}
type notificationChan chan struct{} type notificationChan chan struct{}
@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() {
var persistWatchers []*epochWatcher var persistWatchers []*epochWatcher
var lastPersistedEpoch, lastMergedEpoch uint64 var lastPersistedEpoch, lastMergedEpoch uint64
var ew *epochWatcher var ew *epochWatcher
po, err := s.parsePersisterOptions()
if err != nil {
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
s.asyncTasks.Done()
return
}
OUTER: OUTER:
for { for {
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
select { select {
case <-s.closeCh: case <-s.closeCh:
break OUTER break OUTER
@ -65,11 +111,13 @@ OUTER:
if ew != nil && ew.epoch > lastMergedEpoch { if ew != nil && ew.epoch > lastMergedEpoch {
lastMergedEpoch = ew.epoch lastMergedEpoch = ew.epoch
} }
persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
&lastMergedEpoch, persistWatchers) lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
lastMergedEpoch, persistWatchers, po)
var ourSnapshot *IndexSnapshot var ourSnapshot *IndexSnapshot
var ourPersisted []chan error var ourPersisted []chan error
var ourPersistedCallbacks []index.BatchCallback
// check to see if there is a new snapshot to persist // check to see if there is a new snapshot to persist
s.rootLock.Lock() s.rootLock.Lock()
@ -78,13 +126,17 @@ OUTER:
ourSnapshot.AddRef() ourSnapshot.AddRef()
ourPersisted = s.rootPersisted ourPersisted = s.rootPersisted
s.rootPersisted = nil s.rootPersisted = nil
ourPersistedCallbacks = s.persistedCallbacks
s.persistedCallbacks = nil
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
} }
s.rootLock.Unlock() s.rootLock.Unlock()
if ourSnapshot != nil { if ourSnapshot != nil {
startTime := time.Now() startTime := time.Now()
err := s.persistSnapshot(ourSnapshot) err := s.persistSnapshot(ourSnapshot, po)
for _, ch := range ourPersisted { for _, ch := range ourPersisted {
if err != nil { if err != nil {
ch <- err ch <- err
@ -92,10 +144,22 @@ OUTER:
close(ch) close(ch)
} }
if err != nil { if err != nil {
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
if err == segment.ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
}
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
_ = ourSnapshot.DecRef() _ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
continue OUTER continue OUTER
} }
for i := range ourPersistedCallbacks {
ourPersistedCallbacks[i](err)
}
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)
lastPersistedEpoch = ourSnapshot.epoch lastPersistedEpoch = ourSnapshot.epoch
for _, ew := range persistWatchers { for _, ew := range persistWatchers {
@ -115,6 +179,8 @@ OUTER:
s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
if changed { if changed {
s.removeOldData()
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
continue OUTER continue OUTER
} }
} }
@ -133,17 +199,21 @@ OUTER:
s.removeOldData() // might as well cleanup while waiting s.removeOldData() // might as well cleanup while waiting
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)
select { select {
case <-s.closeCh: case <-s.closeCh:
break OUTER break OUTER
case <-w.notifyCh: case <-w.notifyCh:
// woken up, next loop should pick up work // woken up, next loop should pick up work
continue OUTER atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
case ew = <-s.persisterNotifier: case ew = <-s.persisterNotifier:
// if the watchers are already caught up then let them wait, // if the watchers are already caught up then let them wait,
// else let them continue to do the catch up // else let them continue to do the catch up
persistWatchers = append(persistWatchers, ew) persistWatchers = append(persistWatchers, ew)
} }
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
} }
} }
@ -160,32 +230,88 @@ func notifyMergeWatchers(lastPersistedEpoch uint64,
return watchersNext return watchersNext
} }
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
persistWatchers []*epochWatcher) []*epochWatcher { persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) {
// first, let the watchers proceed if they lag behind // first, let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
// check the merger lag by counting the segment files on disk,
// On finding fewer files on disk, persister takes a short pause
// for sufficient in-memory segments to pile up for the next
// memory merge cum persist loop.
// On finding too many files on disk, persister pause until the merger
// catches up to reduce the segment file count under the threshold.
// But if there is memory pressure, then skip this sleep maneuvers.
numFilesOnDisk, _ := s.diskFileStats()
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
select {
case <-s.closeCh:
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)
case ew := <-s.persisterNotifier:
// unblock the merger in meantime
persistWatchers = append(persistWatchers, ew)
lastMergedEpoch = ew.epoch
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
}
return lastMergedEpoch, persistWatchers
}
OUTER: OUTER:
// check for slow merger and await until the merger catch up for po.PersisterNapUnderNumFiles > 0 &&
for lastPersistedEpoch > *lastMergedEpoch+epochDistance { numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
lastMergedEpoch < lastPersistedEpoch {
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
select { select {
case <-s.closeCh: case <-s.closeCh:
break OUTER break OUTER
case ew := <-s.persisterNotifier: case ew := <-s.persisterNotifier:
persistWatchers = append(persistWatchers, ew) persistWatchers = append(persistWatchers, ew)
*lastMergedEpoch = ew.epoch lastMergedEpoch = ew.epoch
} }
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)
// let the watchers proceed if they lag behind // let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
numFilesOnDisk, _ = s.diskFileStats()
} }
return persistWatchers return lastMergedEpoch, persistWatchers
} }
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
po := persisterOptions{
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec,
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles,
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
}
if v, ok := s.config["scorchPersisterOptions"]; ok {
b, err := json.Marshal(v)
if err != nil {
return &po, err
}
err = json.Unmarshal(b, &po)
if err != nil {
return &po, err
}
}
return &po, nil
}
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
po *persisterOptions) error {
// Perform in-memory segment merging only when the memory pressure is
// below the configured threshold, else the persister performs the
// direct persistence of segments.
if s.paused() < po.MemoryPressurePauseThreshold {
persisted, err := s.persistSnapshotMaybeMerge(snapshot) persisted, err := s.persistSnapshotMaybeMerge(snapshot)
if err != nil { if err != nil {
return err return err
@ -193,6 +319,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
if persisted { if persisted {
return nil return nil
} }
}
return s.persistSnapshotDirect(snapshot) return s.persistSnapshotDirect(snapshot)
} }
@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
return false, nil return false, nil
} }
_, newSnapshot, newSegmentID, err := s.mergeSegmentBases( newSnapshot, newSegmentID, err := s.mergeSegmentBases(
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
if err != nil { if err != nil {
return false, err return false, err
@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)), segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
internal: snapshot.internal, internal: snapshot.internal,
epoch: snapshot.epoch, epoch: snapshot.epoch,
creator: "persistSnapshotMaybeMerge",
} }
// copy to the equiv the segments that weren't replaced // copy to the equiv the segments that weren't replaced
@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
return err return err
} }
// persist meta values
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
if err != nil {
return err
}
err = metaBucket.Put([]byte("type"), []byte(zap.Type))
if err != nil {
return err
}
buf := make([]byte, binary.MaxVarintLen32)
binary.BigEndian.PutUint32(buf, zap.Version)
err = metaBucket.Put([]byte("version"), buf)
if err != nil {
return err
}
// persist internal values // persist internal values
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
if err != nil { if err != nil {
@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
} }
} }
s.rootLock.Lock() persist := &persistIntroduction{
newIndexSnapshot := &IndexSnapshot{ persisted: newSegments,
parent: s, applied: make(notificationChan),
epoch: s.nextSnapshotEpoch,
segment: make([]*SegmentSnapshot, len(s.root.segment)),
offsets: make([]uint64, len(s.root.offsets)),
internal: make(map[string][]byte, len(s.root.internal)),
refs: 1,
}
s.nextSnapshotEpoch++
for i, segmentSnapshot := range s.root.segment {
// see if this segment has been replaced
if replacement, ok := newSegments[segmentSnapshot.id]; ok {
newSegmentSnapshot := &SegmentSnapshot{
id: segmentSnapshot.id,
segment: replacement,
deleted: segmentSnapshot.deleted,
cachedDocs: segmentSnapshot.cachedDocs,
}
newIndexSnapshot.segment[i] = newSegmentSnapshot
delete(newSegments, segmentSnapshot.id)
// update items persisted incase of a new segment snapshot
atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count())
} else {
newIndexSnapshot.segment[i] = s.root.segment[i]
newIndexSnapshot.segment[i].segment.AddRef()
}
newIndexSnapshot.offsets[i] = s.root.offsets[i]
} }
for k, v := range s.root.internal {
newIndexSnapshot.internal[k] = v select {
case <-s.closeCh:
return segment.ErrClosed
case s.persists <- persist:
} }
rootPrev := s.root select {
s.root = newIndexSnapshot case <-s.closeCh:
s.rootLock.Unlock() return segment.ErrClosed
if rootPrev != nil { case <-persist.applied:
_ = rootPrev.DecRef()
} }
} }
@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'}
var boltPathKey = []byte{'p'} var boltPathKey = []byte{'p'}
var boltDeletedKey = []byte{'d'} var boltDeletedKey = []byte{'d'}
var boltInternalKey = []byte{'i'} var boltInternalKey = []byte{'i'}
var boltMetaDataKey = []byte{'m'}
func (s *Scorch) loadFromBolt() error { func (s *Scorch) loadFromBolt() error {
return s.rootBolt.View(func(tx *bolt.Tx) error { return s.rootBolt.View(func(tx *bolt.Tx) error {
@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error {
continue continue
} }
if foundRoot { if foundRoot {
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) s.AddEligibleForRemoval(snapshotEpoch)
continue continue
} }
snapshot := snapshots.Bucket(k) snapshot := snapshots.Bucket(k)
if snapshot == nil { if snapshot == nil {
log.Printf("snapshot key, but bucket missing %x, continuing", k) log.Printf("snapshot key, but bucket missing %x, continuing", k)
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) s.AddEligibleForRemoval(snapshotEpoch)
continue continue
} }
indexSnapshot, err := s.loadSnapshot(snapshot) indexSnapshot, err := s.loadSnapshot(snapshot)
if err != nil { if err != nil {
log.Printf("unable to load snapshot, %v, continuing", err) log.Printf("unable to load snapshot, %v, continuing", err)
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) s.AddEligibleForRemoval(snapshotEpoch)
continue continue
} }
indexSnapshot.epoch = snapshotEpoch indexSnapshot.epoch = snapshotEpoch
@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error {
return err return err
} }
s.nextSegmentID++ s.nextSegmentID++
s.nextSnapshotEpoch = snapshotEpoch + 1
s.rootLock.Lock() s.rootLock.Lock()
if s.root != nil { s.nextSnapshotEpoch = snapshotEpoch + 1
_ = s.root.DecRef() rootPrev := s.root
}
s.root = indexSnapshot s.root = indexSnapshot
s.rootLock.Unlock() s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
foundRoot = true foundRoot = true
} }
return nil return nil
@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
snapshotKey := segment.EncodeUvarintAscending(nil, epoch) snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
snapshot := snapshots.Bucket(snapshotKey) snapshot := snapshots.Bucket(snapshotKey)
if snapshot == nil { if snapshot == nil {
return nil return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
} }
rv, err = s.loadSnapshot(snapshot) rv, err = s.loadSnapshot(snapshot)
return err return err
@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
} }
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
rv := &IndexSnapshot{ rv := &IndexSnapshot{
parent: s, parent: s,
internal: make(map[string][]byte), internal: make(map[string][]byte),
refs: 1, refs: 1,
creator: "loadSnapshot",
} }
var running uint64 var running uint64
c := snapshot.Cursor() c := snapshot.Cursor()
for k, _ := c.First(); k != nil; k, _ = c.Next() { for k, _ := c.First(); k != nil; k, _ = c.Next() {
@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
_ = rv.DecRef() _ = rv.DecRef()
return nil, err return nil, err
} }
} else { } else if k[0] != boltMetaDataKey[0] {
segmentBucket := snapshot.Bucket(k) segmentBucket := snapshot.Bucket(k)
if segmentBucket == nil { if segmentBucket == nil {
_ = rv.DecRef() _ = rv.DecRef()
@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
running += segmentSnapshot.segment.Count() running += segmentSnapshot.segment.Count()
} }
} }
return rv, nil return rv, nil
} }
@ -604,8 +731,10 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
_ = segment.Close() _ = segment.Close()
return nil, fmt.Errorf("error reading deleted bytes: %v", err) return nil, fmt.Errorf("error reading deleted bytes: %v", err)
} }
if !deletedBitmap.IsEmpty() {
rv.deleted = deletedBitmap rv.deleted = deletedBitmap
} }
}
return rv, nil return rv, nil
} }
@ -643,14 +772,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
return 0, err return 0, err
} }
if len(persistedEpochs) <= NumSnapshotsToKeep { if len(persistedEpochs) <= s.numSnapshotsToKeep {
// we need to keep everything // we need to keep everything
return 0, nil return 0, nil
} }
// make a map of epochs to protect from deletion // make a map of epochs to protect from deletion
protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep) protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep)
for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] { for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] {
protectedEpochs[epoch] = struct{}{} protectedEpochs[epoch] = struct{}{}
} }
@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
s.eligibleForRemoval = newEligible s.eligibleForRemoval = newEligible
s.rootLock.Unlock() s.rootLock.Unlock()
if len(epochsToRemove) <= 0 { if len(epochsToRemove) == 0 {
return 0, nil return 0, nil
} }

@ -1,110 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
)
// Reader is a thin wrapper around an IndexSnapshot; every method on it
// forwards to the wrapped snapshot held in root.
type Reader struct {
root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
}
// TermFieldReader forwards to the root snapshot's TermFieldReader, passing
// term, field and the three include flags through unchanged.
func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
}
// DocIDReaderAll returns an iterator over all doc ids
// The caller must close returned instance to release associated resources.
// The call is forwarded to the root snapshot's DocIDReaderAll.
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
return r.root.DocIDReaderAll()
}
// DocIDReaderOnly forwards to the root snapshot's DocIDReaderOnly with the
// given ids.
func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
return r.root.DocIDReaderOnly(ids)
}
// FieldDict forwards to the root snapshot's FieldDict for the named field.
func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
return r.root.FieldDict(field)
}
// FieldDictRange is currently defined to include the start and end terms
// The call is forwarded to the root snapshot's FieldDictRange with the same
// field, startTerm and endTerm.
func (r *Reader) FieldDictRange(field string, startTerm []byte,
endTerm []byte) (index.FieldDict, error) {
return r.root.FieldDictRange(field, startTerm, endTerm)
}
// FieldDictPrefix forwards to the root snapshot's FieldDictPrefix with the
// same field and termPrefix.
func (r *Reader) FieldDictPrefix(field string,
termPrefix []byte) (index.FieldDict, error) {
return r.root.FieldDictPrefix(field, termPrefix)
}
// Document forwards to the root snapshot's Document for the given id.
func (r *Reader) Document(id string) (*document.Document, error) {
return r.root.Document(id)
}
// DocumentVisitFieldTerms forwards to the root snapshot's
// DocumentVisitFieldTerms, passing id, fields and visitor through unchanged.
func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
visitor index.DocumentFieldTermVisitor) error {
return r.root.DocumentVisitFieldTerms(id, fields, visitor)
}
// Fields forwards to the root snapshot's Fields.
func (r *Reader) Fields() ([]string, error) {
return r.root.Fields()
}
// GetInternal forwards to the root snapshot's GetInternal for the given key.
func (r *Reader) GetInternal(key []byte) ([]byte, error) {
return r.root.GetInternal(key)
}
// DocCount forwards to the root snapshot's DocCount.
func (r *Reader) DocCount() (uint64, error) {
return r.root.DocCount()
}
// ExternalID forwards to the root snapshot's ExternalID for the given
// internal id.
func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
return r.root.ExternalID(id)
}
// InternalID forwards to the root snapshot's InternalID for the given
// external id string.
func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
return r.root.InternalID(id)
}
// DumpAll returns a channel on which no values are ever delivered; it is
// closed before being returned, so a receiver sees end-of-stream at once.
//
// The channel is closed synchronously: nothing is ever sent on it, so a
// helper goroutine whose only job is to call close adds scheduling overhead
// without changing what a receiver observes.
func (r *Reader) DumpAll() chan interface{} {
	rv := make(chan interface{})
	close(rv)
	return rv
}
// DumpDoc returns a channel on which no values are ever delivered; it is
// closed before being returned, so a receiver sees end-of-stream at once.
// The id argument is not consulted.
//
// The channel is closed synchronously: nothing is ever sent on it, so a
// helper goroutine whose only job is to call close adds scheduling overhead
// without changing what a receiver observes.
func (r *Reader) DumpDoc(id string) chan interface{} {
	rv := make(chan interface{})
	close(rv)
	return rv
}
// DumpFields returns a channel on which no values are ever delivered; it is
// closed before being returned, so a receiver sees end-of-stream at once.
//
// The channel is closed synchronously: nothing is ever sent on it, so a
// helper goroutine whose only job is to call close adds scheduling overhead
// without changing what a receiver observes.
func (r *Reader) DumpFields() chan interface{} {
	rv := make(chan interface{})
	close(rv)
	return rv
}
// Close calls DecRef on the root snapshot and returns whatever error DecRef
// reports.
func (r *Reader) Close() error {
return r.root.DecRef()
}

@ -17,6 +17,7 @@ package scorch
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io/ioutil"
"os" "os"
"sync" "sync"
"sync/atomic" "sync/atomic"
@ -27,23 +28,24 @@ import (
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/mem"
"github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/boltdb/bolt" bolt "github.com/etcd-io/bbolt"
) )
const Name = "scorch" const Name = "scorch"
const Version uint8 = 1 const Version uint8 = 2
var ErrClosed = fmt.Errorf("scorch closed")
type Scorch struct { type Scorch struct {
readOnly bool readOnly bool
version uint8 version uint8
config map[string]interface{} config map[string]interface{}
analysisQueue *index.AnalysisQueue analysisQueue *index.AnalysisQueue
stats *Stats stats Stats
nextSegmentID uint64 nextSegmentID uint64
path string path string
@ -52,12 +54,15 @@ type Scorch struct {
rootLock sync.RWMutex rootLock sync.RWMutex
root *IndexSnapshot // holds 1 ref-count on the root root *IndexSnapshot // holds 1 ref-count on the root
rootPersisted []chan error // closed when root is persisted rootPersisted []chan error // closed when root is persisted
persistedCallbacks []index.BatchCallback
nextSnapshotEpoch uint64 nextSnapshotEpoch uint64
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
numSnapshotsToKeep int
closeCh chan struct{} closeCh chan struct{}
introductions chan *segmentIntroduction introductions chan *segmentIntroduction
persists chan *persistIntroduction
merges chan *segmentMerge merges chan *segmentMerge
introducerNotifier chan *epochWatcher introducerNotifier chan *epochWatcher
revertToSnapshots chan *snapshotReversion revertToSnapshots chan *snapshotReversion
@ -67,6 +72,23 @@ type Scorch struct {
onEvent func(event Event) onEvent func(event Event)
onAsyncError func(err error) onAsyncError func(err error)
iStats internalStats
pauseLock sync.RWMutex
pauseCount uint64
}
type internalStats struct {
persistEpoch uint64
persistSnapshotSize uint64
mergeEpoch uint64
mergeSnapshotSize uint64
newSegBufBytesAdded uint64
newSegBufBytesRemoved uint64
analysisBytesAdded uint64
analysisBytesRemoved uint64
} }
func NewScorch(storeName string, func NewScorch(storeName string,
@ -80,8 +102,7 @@ func NewScorch(storeName string,
closeCh: make(chan struct{}), closeCh: make(chan struct{}),
ineligibleForRemoval: map[string]bool{}, ineligibleForRemoval: map[string]bool{},
} }
rv.stats = &Stats{i: rv} rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"}
rv.root = &IndexSnapshot{parent: rv, refs: 1}
ro, ok := config["read_only"].(bool) ro, ok := config["read_only"].(bool)
if ok { if ok {
rv.readOnly = ro rv.readOnly = ro
@ -101,9 +122,30 @@ func NewScorch(storeName string,
return rv, nil return rv, nil
} }
// paused returns the current value of pauseCount.
//
// The counter is only read here, so the shared (read) side of pauseLock is
// taken. The writers, incrPause and decrPause, hold the exclusive side, so
// the read is still ordered with respect to them, while concurrent callers
// of paused no longer serialize against each other.
func (s *Scorch) paused() uint64 {
	s.pauseLock.RLock()
	defer s.pauseLock.RUnlock()
	return s.pauseCount
}
// incrPause adds one to pauseCount while holding pauseLock exclusively.
func (s *Scorch) incrPause() {
	s.pauseLock.Lock()
	defer s.pauseLock.Unlock()

	s.pauseCount++
}
// decrPause subtracts one from pauseCount while holding pauseLock
// exclusively. The counter is an unsigned integer, so a decrement at zero
// wraps around rather than going negative.
func (s *Scorch) decrPause() {
	s.pauseLock.Lock()
	defer s.pauseLock.Unlock()

	s.pauseCount--
}
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
if s.onEvent != nil { if s.onEvent != nil {
s.incrPause()
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
s.decrPause()
} }
} }
@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) {
if s.onAsyncError != nil { if s.onAsyncError != nil {
s.onAsyncError(err) s.onAsyncError(err)
} }
atomic.AddUint64(&s.stats.TotOnErrors, 1)
} }
func (s *Scorch) Open() error { func (s *Scorch) Open() error {
@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error {
} }
} }
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment)))
s.introductions = make(chan *segmentIntroduction) s.introductions = make(chan *segmentIntroduction)
s.persists = make(chan *persistIntroduction)
s.merges = make(chan *segmentMerge) s.merges = make(chan *segmentMerge)
s.introducerNotifier = make(chan *epochWatcher, 1) s.introducerNotifier = make(chan *epochWatcher, 1)
s.revertToSnapshots = make(chan *snapshotReversion) s.revertToSnapshots = make(chan *snapshotReversion)
@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error {
} }
} }
s.numSnapshotsToKeep = NumSnapshotsToKeep
if v, ok := s.config["numSnapshotsToKeep"]; ok {
var t int
if t, err = parseToInteger(v); err != nil {
return fmt.Errorf("numSnapshotsToKeep parse err: %v", err)
}
if t > 0 {
s.numSnapshotsToKeep = t
}
}
return nil return nil
} }
@ -255,6 +312,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
// FIXME could sort ids list concurrent with analysis? // FIXME could sort ids list concurrent with analysis?
if len(batch.IndexOps) > 0 {
go func() { go func() {
for _, doc := range batch.IndexOps { for _, doc := range batch.IndexOps {
if doc != nil { if doc != nil {
@ -264,47 +322,63 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
} }
} }
}() }()
}
// wait for analysis result // wait for analysis result
analysisResults := make([]*index.AnalysisResult, int(numUpdates)) analysisResults := make([]*index.AnalysisResult, int(numUpdates))
var itemsDeQueued uint64 var itemsDeQueued uint64
var totalAnalysisSize int
for itemsDeQueued < numUpdates { for itemsDeQueued < numUpdates {
result := <-resultChan result := <-resultChan
resultSize := result.Size()
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize))
totalAnalysisSize += resultSize
analysisResults[itemsDeQueued] = result analysisResults[itemsDeQueued] = result
itemsDeQueued++ itemsDeQueued++
} }
close(resultChan) close(resultChan)
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize))
atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start))) atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start)))
indexStart := time.Now()
// notify handlers that we're about to introduce a segment // notify handlers that we're about to introduce a segment
s.fireEvent(EventKindBatchIntroductionStart, 0) s.fireEvent(EventKindBatchIntroductionStart, 0)
var newSegment segment.Segment var newSegment segment.Segment
var bufBytes uint64
if len(analysisResults) > 0 { if len(analysisResults) > 0 {
newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor) newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor)
if err != nil { if err != nil {
return err return err
} }
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes)
} else {
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
} }
err = s.prepareSegment(newSegment, ids, batch.InternalOps) err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback())
if err != nil { if err != nil {
if newSegment != nil { if newSegment != nil {
_ = newSegment.Close() _ = newSegment.Close()
} }
atomic.AddUint64(&s.stats.errors, 1) atomic.AddUint64(&s.stats.TotOnErrors, 1)
} else { } else {
atomic.AddUint64(&s.stats.updates, numUpdates) atomic.AddUint64(&s.stats.TotUpdates, numUpdates)
atomic.AddUint64(&s.stats.deletes, numDeletes) atomic.AddUint64(&s.stats.TotDeletes, numDeletes)
atomic.AddUint64(&s.stats.batches, 1) atomic.AddUint64(&s.stats.TotBatches, 1)
atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes) atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes)
} }
atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes)
atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart)))
return err return err
} }
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
internalOps map[string][]byte) error { internalOps map[string][]byte, persistedCallback index.BatchCallback) error {
// new introduction // new introduction
introduction := &segmentIntroduction{ introduction := &segmentIntroduction{
@ -314,6 +388,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
obsoletes: make(map[uint64]*roaring.Bitmap), obsoletes: make(map[uint64]*roaring.Bitmap),
internal: internalOps, internal: internalOps,
applied: make(chan error), applied: make(chan error),
persistedCallback: persistedCallback,
} }
if !s.unsafeBatch { if !s.unsafeBatch {
@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
root.AddRef() root.AddRef()
s.rootLock.RUnlock() s.rootLock.RUnlock()
defer func() { _ = root.DecRef() }()
for _, seg := range root.segment { for _, seg := range root.segment {
delta, err := seg.segment.DocNumbers(ids) delta, err := seg.segment.DocNumbers(ids)
if err != nil { if err != nil {
@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
introduction.obsoletes[seg.id] = delta introduction.obsoletes[seg.id] = delta
} }
_ = root.DecRef() introStartTime := time.Now()
s.introductions <- introduction s.introductions <- introduction
@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
err = <-introduction.persisted err = <-introduction.persisted
} }
introTime := uint64(time.Since(introStartTime))
atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime)
if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime {
atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime)
}
return err return err
} }
@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error {
// Reader returns a low-level accessor on the index data. Close it to // Reader returns a low-level accessor on the index data. Close it to
// release associated resources. // release associated resources.
func (s *Scorch) Reader() (index.IndexReader, error) { func (s *Scorch) Reader() (index.IndexReader, error) {
return s.currentSnapshot(), nil
}
func (s *Scorch) currentSnapshot() *IndexSnapshot {
s.rootLock.RLock() s.rootLock.RLock()
rv := &Reader{root: s.root} rv := s.root
rv.root.AddRef() if rv != nil {
rv.AddRef()
}
s.rootLock.RUnlock() s.rootLock.RUnlock()
return rv, nil return rv
} }
func (s *Scorch) Stats() json.Marshaler { func (s *Scorch) Stats() json.Marshaler {
return s.stats return &s.stats
}
func (s *Scorch) diskFileStats() (uint64, uint64) {
var numFilesOnDisk, numBytesUsedDisk uint64
if s.path != "" {
finfos, err := ioutil.ReadDir(s.path)
if err == nil {
for _, finfo := range finfos {
if !finfo.IsDir() {
numBytesUsedDisk += uint64(finfo.Size())
numFilesOnDisk++
}
}
} }
}
return numFilesOnDisk, numBytesUsedDisk
}
func (s *Scorch) StatsMap() map[string]interface{} { func (s *Scorch) StatsMap() map[string]interface{} {
m, _ := s.stats.statsMap() m := s.stats.ToMap()
numFilesOnDisk, numBytesUsedDisk := s.diskFileStats()
m["CurOnDiskBytes"] = numBytesUsedDisk
m["CurOnDiskFiles"] = numFilesOnDisk
// TODO: consider one day removing these backwards compatible
// names for apps using the old names
m["updates"] = m["TotUpdates"]
m["deletes"] = m["TotDeletes"]
m["batches"] = m["TotBatches"]
m["errors"] = m["TotOnErrors"]
m["analysis_time"] = m["TotAnalysisTime"]
m["index_time"] = m["TotIndexTime"]
m["term_searchers_started"] = m["TotTermSearchersStarted"]
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
m["num_items_introduced"] = m["TotIntroducedItems"]
m["num_items_persisted"] = m["TotPersistedItems"]
m["num_recs_to_persist"] = m["TotItemsToPersist"]
m["num_bytes_used_disk"] = m["CurOnDiskBytes"]
m["num_files_on_disk"] = m["CurOnDiskFiles"]
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"]
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"]
m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"]
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"]
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
return m return m
} }
@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult {
rv.Analyzed[i] = tokenFreqs rv.Analyzed[i] = tokenFreqs
rv.Length[i] = fieldLength rv.Length[i] = fieldLength
if len(d.CompositeFields) > 0 { if len(d.CompositeFields) > 0 && field.Name() != "_id" {
// see if any of the composite fields need this // see if any of the composite fields need this
for _, compositeField := range d.CompositeFields { for _, compositeField := range d.CompositeFields {
compositeField.Compose(field.Name(), fieldLength, tokenFreqs) compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) {
s.rootLock.Unlock() s.rootLock.Unlock()
} }
func (s *Scorch) MemoryUsed() uint64 { func (s *Scorch) MemoryUsed() (memUsed uint64) {
var memUsed uint64 indexSnapshot := s.currentSnapshot()
s.rootLock.RLock() if indexSnapshot == nil {
if s.root != nil { return
for _, segmentSnapshot := range s.root.segment {
memUsed += 8 /* size of id -> uint64 */ +
segmentSnapshot.segment.SizeInBytes()
if segmentSnapshot.deleted != nil {
memUsed += segmentSnapshot.deleted.GetSizeInBytes()
} }
memUsed += segmentSnapshot.cachedDocs.sizeInBytes()
defer func() {
_ = indexSnapshot.Close()
}()
// Account for current root snapshot overhead
memUsed += uint64(indexSnapshot.Size())
// Account for snapshot that the persister may be working on
persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch)
persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize)
if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch {
// the snapshot that the persister is working on isn't the same as
// the current snapshot
memUsed += persistSnapshotSize
} }
// Account for snapshot that the merger may be working on
mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch)
mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize)
if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch {
// the snapshot that the merger is working on isn't the same as
// the current snapshot
memUsed += mergeSnapshotSize
} }
s.rootLock.RUnlock()
memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) -
atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved))
memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) -
atomic.LoadUint64(&s.iStats.analysisBytesRemoved))
return memUsed return memUsed
} }
@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) {
func init() { func init() {
registry.RegisterIndexType(Name, NewScorch) registry.RegisterIndexType(Name, NewScorch)
} }
// parseToInteger converts a loosely typed value to an int.
//
// Accepted dynamic types:
//   - int: returned as is.
//   - float64: truncated toward zero. NaN, infinities and values outside the
//     range of int are rejected, because converting such a float to an
//     integer type yields an implementation-dependent result.
//
// Any other dynamic type (including nil) produces an error.
func parseToInteger(i interface{}) (int, error) {
	const (
		maxInt = int(^uint(0) >> 1)
		minInt = -maxInt - 1
	)

	switch v := i.(type) {
	case float64:
		// minInt is a negative power of two and so converts to float64
		// exactly; the representable window is [minInt, -minInt).
		// v != v is true only for NaN, which fails neither range test.
		if v != v || v < float64(minInt) || v >= -float64(minInt) {
			return 0, fmt.Errorf("value %v is out of range for int", v)
		}
		return int(v), nil
	case int:
		return v, nil
	default:
		return 0, fmt.Errorf("expects int or float64 value")
	}
}

@ -17,6 +17,7 @@ package segment
import ( import (
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/couchbase/vellum"
) )
type EmptySegment struct{} type EmptySegment struct{}
@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit
return nil return nil
} }
// DocID ignores num and always returns a nil identifier with a nil error.
func (e *EmptySegment) DocID(num uint64) ([]byte, error) {
	return nil, nil
}
func (e *EmptySegment) Count() uint64 { func (e *EmptySegment) Count() uint64 {
return 0 return 0
} }
@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error {
return nil return nil
} }
// Size always reports 0.
// NOTE(review): this returns uint64 while the Segment interface in this
// package appears to declare Size() int — confirm whether EmptySegment is
// meant to satisfy that interface.
func (e *EmptySegment) Size() uint64 {
	return 0
}
func (e *EmptySegment) AddRef() { func (e *EmptySegment) AddRef() {
} }
@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error {
type EmptyDictionary struct{} type EmptyDictionary struct{}
func (e *EmptyDictionary) PostingsList(term string, func (e *EmptyDictionary) PostingsList(term []byte,
except *roaring.Bitmap) (PostingsList, error) { except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) {
return &EmptyPostingsList{}, nil return &EmptyPostingsList{}, nil
} }
@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
return &EmptyDictionaryIterator{} return &EmptyDictionaryIterator{}
} }
// AutomatonIterator ignores the automaton and both key bounds and returns a
// new EmptyDictionaryIterator.
func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton,
	startKeyInclusive, endKeyExclusive []byte) DictionaryIterator {
	return &EmptyDictionaryIterator{}
}
// OnlyIterator ignores onlyTerms and includeCount and returns a new
// EmptyDictionaryIterator.
func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
	includeCount bool) DictionaryIterator {
	return &EmptyDictionaryIterator{}
}
type EmptyDictionaryIterator struct{} type EmptyDictionaryIterator struct{}
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
return nil, nil return nil, nil
} }
// Advance ignores the requested doc number and always returns a nil posting
// with a nil error.
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
	return nil, nil
}
type EmptyPostingsList struct{} type EmptyPostingsList struct{}
func (e *EmptyPostingsList) Iterator() PostingsIterator { func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool,
prealloc PostingsIterator) PostingsIterator {
return &EmptyPostingsIterator{} return &EmptyPostingsIterator{}
} }
// Size always reports 0.
func (e *EmptyPostingsList) Size() int {
	return 0
}
func (e *EmptyPostingsList) Count() uint64 { func (e *EmptyPostingsList) Count() uint64 {
return 0 return 0
} }
@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{}
func (e *EmptyPostingsIterator) Next() (Posting, error) { func (e *EmptyPostingsIterator) Next() (Posting, error) {
return nil, nil return nil, nil
} }
// Size always reports 0.
func (e *EmptyPostingsIterator) Size() int {
	return 0
}
// AnEmptyPostingsIterator is a package-level EmptyPostingsIterator instance.
// NOTE(review): presumably shared so callers can avoid allocating a new
// empty iterator each time — confirm against its users.
var AnEmptyPostingsIterator = &EmptyPostingsIterator{}

@ -1,321 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mem
import (
"math"
"sort"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
)
// NewFromAnalyzedDocs places the analyzed document mutations into a new
// in-memory segment.
//
// The _id field is defined first so that it receives field id 0, then the
// dictionaries are populated and the postings storage is preallocated by
// initializeDict, every result is folded in via processDocument, the
// per-field term lists are sorted ascending, and finally the segment's
// memory footprint is computed.
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
	s := New()
	// ensure that _id field get fieldID 0
	s.getOrDefineField("_id")
	// fill Dicts/DictKeys and preallocate memory
	s.initializeDict(results)
	// walk each doc
	for _, result := range results {
		s.processDocument(result)
	}
	// go back and sort the dictKeys
	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}
	// compute memory usage of segment
	s.updateSizeInBytes()
	// professional debugging
	//
	// log.Printf("fields: %v\n", s.FieldsMap)
	// log.Printf("fieldsInv: %v\n", s.FieldsInv)
	// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
	// log.Printf("dicts: %v\n", s.Dicts)
	// log.Printf("dict keys: %v\n", s.DictKeys)
	// for i, posting := range s.Postings {
	// log.Printf("posting %d: %v\n", i, posting)
	// }
	// for i, freq := range s.Freqs {
	// log.Printf("freq %d: %v\n", i, freq)
	// }
	// for i, norm := range s.Norms {
	// log.Printf("norm %d: %v\n", i, norm)
	// }
	// for i, field := range s.Locfields {
	// log.Printf("field %d: %v\n", i, field)
	// }
	// for i, start := range s.Locstarts {
	// log.Printf("start %d: %v\n", i, start)
	// }
	// for i, end := range s.Locends {
	// log.Printf("end %d: %v\n", i, end)
	// }
	// for i, pos := range s.Locpos {
	// log.Printf("pos %d: %v\n", i, pos)
	// }
	// for i, apos := range s.Locarraypos {
	// log.Printf("apos %d: %v\n", i, apos)
	// }
	// log.Printf("stored: %v\n", s.Stored)
	// log.Printf("stored types: %v\n", s.StoredTypes)
	// log.Printf("stored pos: %v\n", s.StoredPos)
	return s
}
// initializeDict fills Dicts/DictKeys and preallocates memory for postings.
//
// It makes two passes over results. The first defines every field name so
// that the field ids can be reassigned in sorted name order (with _id kept
// at id 0). The second assigns a postings list id to each distinct
// (field, term) pair while counting, per postings list, how many documents
// and how many locations it will receive. Those counts are then used to
// carve zero-length sub-slices out of a few large backing arrays, so that
// the appends performed later by processDocument land in preallocated
// storage.
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
	var numPostingsLists int
	numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
	numLocsPerPostingsList := make([]int, 0, 64)  // Keyed by postings list id.
	var numTokenFrequencies int
	var totLocs int
	// initial scan for all fieldID's to sort them
	for _, result := range results {
		for _, field := range result.Document.CompositeFields {
			s.getOrDefineField(field.Name())
		}
		for _, field := range result.Document.Fields {
			s.getOrDefineField(field.Name())
		}
	}
	sort.Strings(s.FieldsInv[1:]) // keep _id as first field
	// rebuild the name -> id+1 map so it matches the sorted FieldsInv order
	s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}
	// processField registers any new terms of one field occurrence and
	// accumulates the per-postings-list counters declared above.
	processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
		for term, tf := range tfs {
			pidPlus1, exists := s.Dicts[fieldID][term]
			if !exists {
				numPostingsLists++
				pidPlus1 = uint64(numPostingsLists)
				s.Dicts[fieldID][term] = pidPlus1
				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
				numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
				numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
			}
			pid := pidPlus1 - 1
			numTermsPerPostingsList[pid] += 1
			numLocsPerPostingsList[pid] += len(tf.Locations)
			totLocs += len(tf.Locations)
		}
		numTokenFrequencies += len(tfs)
	}
	for _, result := range results {
		// walk each composite field
		for _, field := range result.Document.CompositeFields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			_, tf := field.Analyze()
			processField(fieldID, tf)
		}
		// walk each field
		for i, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			tf := result.Analyzed[i]
			processField(fieldID, tf)
		}
	}
	s.Postings = make([]*roaring.Bitmap, numPostingsLists)
	for i := 0; i < numPostingsLists; i++ {
		s.Postings[i] = roaring.New()
	}
	s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
	for i := 0; i < numPostingsLists; i++ {
		s.PostingsLocs[i] = roaring.New()
	}
	// Preallocate big, contiguous backing arrays.
	auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
	uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
	float32Backing := make([]float32, numTokenFrequencies)         // For sub-Norms.
	uint16Backing := make([]uint16, totLocs)                       // For sub-Locfields.
	// Point top-level slices to the backing arrays.
	s.Freqs = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]
	s.Norms = make([][]float32, numPostingsLists)
	s.Locfields = make([][]uint16, numPostingsLists)
	s.Locstarts = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]
	s.Locends = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]
	s.Locpos = auint64Backing[0:numPostingsLists]
	auint64Backing = auint64Backing[numPostingsLists:]
	s.Locarraypos = make([][][]uint64, numPostingsLists)
	// Point sub-slices to the backing arrays.
	// Each sub-slice starts with length 0; the backing slice is then advanced
	// past the region reserved for it.
	for pid, numTerms := range numTermsPerPostingsList {
		s.Freqs[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numTerms:]
		s.Norms[pid] = float32Backing[0:0]
		float32Backing = float32Backing[numTerms:]
	}
	for pid, numLocs := range numLocsPerPostingsList {
		s.Locfields[pid] = uint16Backing[0:0]
		uint16Backing = uint16Backing[numLocs:]
		s.Locstarts[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]
		s.Locends[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]
		s.Locpos[pid] = uint64Backing[0:0]
		uint64Backing = uint64Backing[numLocs:]
		s.Locarraypos[pid] = auint64Backing[0:0]
		auint64Backing = auint64Backing[numLocs:]
	}
}
// processDocument adds one analyzed document to the segment.
//
// It allocates the next document number, merges the token frequencies of
// all fields (composite and regular) per field id, records stored field
// values and doc-value flags, and then appends the document to the postings
// list of every term it contains together with frequency, norm and location
// data.
func (s *Segment) processDocument(result *index.AnalysisResult) {
	// used to collate information across fields
	docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
	fieldLens := make(map[uint16]int, len(s.FieldsMap))
	docNum := uint64(s.addDocument())
	// processField accumulates the length and token frequencies of one field
	// occurrence; repeated occurrences of the same field id are merged into
	// the frequencies recorded first.
	processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
		fieldLens[field] += l
		if existingFreqs, ok := docMap[field]; ok {
			existingFreqs.MergeAll(name, tf)
		} else {
			docMap[field] = tf
		}
	}
	// storeField appends one stored value with its type byte and array
	// positions to the per-document stored maps.
	storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
		s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
		s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
		s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
	}
	// walk each composite field
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		l, tf := field.Analyze()
		processField(fieldID, field.Name(), l, tf)
	}
	// walk each field
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		l := result.Length[i]
		tf := result.Analyzed[i]
		processField(fieldID, field.Name(), l, tf)
		if field.Options().IsStored() {
			storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
		}
		if field.Options().IncludeDocValues() {
			s.DocValueFields[fieldID] = true
		}
	}
	// now that it's been rolled up into docMap, walk that
	for fieldID, tokenFrequencies := range docMap {
		for term, tokenFreq := range tokenFrequencies {
			// Dicts stores postings list id + 1
			pid := s.Dicts[fieldID][term] - 1
			bs := s.Postings[pid]
			bs.AddInt(int(docNum))
			s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
			// norm is 1/sqrt(total length of this field in the document)
			s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
			locationBS := s.PostingsLocs[pid]
			if len(tokenFreq.Locations) > 0 {
				locationBS.AddInt(int(docNum))
				for _, loc := range tokenFreq.Locations {
					// a location that names its own field overrides the
					// field the term was indexed under
					var locf = fieldID
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					s.Locfields[pid] = append(s.Locfields[pid], locf)
					s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
					s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
					s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
					if len(loc.ArrayPositions) > 0 {
						s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
					} else {
						s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
					}
				}
			}
		}
	}
}
// getOrDefineField returns the zero-based id of the named field.
//
// A field the segment has not seen yet is registered first: it gets the next
// free id (stored as id+1 in FieldsMap), its name is appended to FieldsInv,
// and an empty term dictionary and term list are appended for it.
func (s *Segment) getOrDefineField(name string) int {
	if known, found := s.FieldsMap[name]; found {
		return int(known - 1)
	}
	assigned := uint16(len(s.FieldsInv) + 1)
	s.FieldsMap[name] = assigned
	s.FieldsInv = append(s.FieldsInv, name)
	s.Dicts = append(s.Dicts, make(map[string]uint64))
	s.DictKeys = append(s.DictKeys, make([]string, 0))
	return int(assigned - 1)
}
// addDocument reserves the next document number by appending an empty entry
// to each of the stored-field tables and returns that number.
func (s *Segment) addDocument() int {
	next := len(s.Stored)
	s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
	s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
	s.Stored = append(s.Stored, map[uint16][][]byte{})
	return next
}
// encodeFieldType maps the concrete type of a document field to the
// single-byte type code kept alongside stored values. Field types not
// listed here are encoded as 'x'.
func encodeFieldType(f document.Field) byte {
	switch f.(type) {
	case *document.TextField:
		return 't'
	case *document.NumericField:
		return 'n'
	case *document.DateTimeField:
		return 'd'
	case *document.BooleanField:
		return 'b'
	case *document.GeoPointField:
		return 'g'
	case *document.CompositeField:
		return 'c'
	default:
		return 'x'
	}
}

@ -1,103 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mem
import (
"sort"
"strings"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
)
// Dictionary is the in-memory representation of the term dictionary
// of a single field within a Segment.
type Dictionary struct {
	segment *Segment // segment whose Dicts/DictKeys/Postings are consulted
	field   string   // name of the field this dictionary was opened for
	fieldID uint16   // zero-based field id used to index the segment tables
}
// PostingsList returns the postings list for the specified term.
//
// The returned list records the term's entry from the field's dictionary
// (postings list id + 1, or 0 when the term is not in the map) together with
// the except bitmap. The error result is always nil.
func (d *Dictionary) PostingsList(term string,
	except *roaring.Bitmap) (segment.PostingsList, error) {
	list := &PostingsList{dictionary: d, term: term, except: except}
	list.postingsID = d.segment.Dicts[d.fieldID][term]
	return list, nil
}
// Iterator returns an iterator for this dictionary. The iterator starts at
// offset 0 with no prefix or end restriction set.
func (d *Dictionary) Iterator() segment.DictionaryIterator {
	return &DictionaryIterator{
		d: d,
	}
}
// PrefixIterator returns an iterator which only visits terms having the
// specified prefix. Its starting offset is found with a binary search
// over the field's term list.
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
	terms := d.segment.DictKeys[d.fieldID]
	it := &DictionaryIterator{d: d, prefix: prefix}
	it.offset = sort.SearchStrings(terms, prefix)
	return it
}
// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
// Its starting offset is found with a binary search for start over the
// field's term list.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
	terms := d.segment.DictKeys[d.fieldID]
	it := &DictionaryIterator{d: d, end: end}
	it.offset = sort.SearchStrings(terms, start)
	return it
}
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
	d         *Dictionary
	prefix    string          // when non-empty, iteration stops at the first term lacking this prefix
	end       string          // when non-empty, iteration stops at the first term greater than end
	offset    int             // index of the next term in the field's term list
	dictEntry index.DictEntry // reused across Next()'s
}
// Next returns the next entry in the dictionary, or nil once the term list
// is exhausted or the next term falls outside the iterator's prefix or
// inclusive end bound. The returned entry is owned by the iterator and is
// overwritten by the following call.
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
	seg := d.d.segment
	terms := seg.DictKeys[d.d.fieldID]
	if d.offset >= len(terms) {
		return nil, nil
	}
	term := terms[d.offset]
	switch {
	case d.prefix != "" && !strings.HasPrefix(term, d.prefix):
		// left the prefix range
		return nil, nil
	case d.end != "" && term > d.end:
		// past the end (bleve.index API demands inclusive end)
		return nil, nil
	}
	d.offset++
	pidPlus1 := seg.Dicts[d.d.fieldID][term]
	d.dictEntry.Term = term
	d.dictEntry.Count = seg.Postings[pidPlus1-1].GetCardinality()
	return &d.dictEntry, nil
}

@ -1,178 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mem
import (
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
)
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
	dictionary *Dictionary
	term       string
	postingsID uint64          // postings list id + 1; 0 when the term has no postings list
	except     *roaring.Bitmap // doc numbers to leave out; may be nil
}
// Count returns the number of items on this postings list, not counting
// document numbers that also appear in the except bitmap. A list whose
// postingsID is 0 (term without a postings list) has a count of 0.
func (p *PostingsList) Count() uint64 {
	if p.postingsID == 0 {
		return 0
	}
	postings := p.dictionary.segment.Postings[p.postingsID-1]
	total := postings.GetCardinality()
	if p.except != nil {
		// Only excluded documents that are actually on this list reduce the
		// count. Subtracting the full cardinality of except would under-count
		// whenever except also holds documents that never contained the term,
		// and would disagree with Iterator, which applies AndNot. The
		// intersection can never exceed total, so this cannot underflow.
		total -= postings.AndCardinality(p.except)
	}
	return total
}
// Iterator returns an iterator for this postings list.
//
// When the list has no postings (postingsID is 0) the iterator is returned
// without any bitmap iterators set. Otherwise it carries an iterator over
// the full postings bitmap plus a second one over the postings that remain
// after removing the except bitmap, if any.
func (p *PostingsList) Iterator() segment.PostingsIterator {
	it := &PostingsIterator{postings: p}
	if p.postingsID == 0 {
		return it
	}
	seg := p.dictionary.segment
	full := seg.Postings[p.postingsID-1]
	it.locations = seg.PostingsLocs[p.postingsID-1]
	it.all = full.Iterator()
	if p.except == nil {
		it.actual = full.Iterator()
		return it
	}
	// work on a copy so the segment's own bitmap is left untouched
	remaining := full.Clone()
	remaining.AndNot(p.except)
	it.actual = remaining.Iterator()
	return it
}
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
	postings  *PostingsList
	all       roaring.IntIterable // iterates every doc number on the list
	locations *roaring.Bitmap     // doc numbers that have location data
	offset    int                 // index into Freqs/Norms of the next posting in 'all'
	locoffset int                 // index into the Loc* slices for the next posting in 'all'
	actual    roaring.IntIterable // iterates the doc numbers left after applying except
}
// Next returns the next posting on the postings list, or nil at the end.
//
// The 'actual' iterator decides which document is returned; the 'all'
// iterator is stepped alongside it so that offset and locoffset keep
// pointing at the frequency/location entries of the returned document even
// when excluded documents are skipped.
func (i *PostingsIterator) Next() (segment.Posting, error) {
	if i.actual == nil || !i.actual.HasNext() {
		return nil, nil
	}
	n := i.actual.Next()
	allN := i.all.Next()
	// n is the next actual hit (excluding some postings)
	// allN is the next hit in the full postings
	// if they don't match, adjust offsets to factor in item we're skipping over
	// incr the all iterator, and check again
	for allN != n {
		i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
		i.offset++
		allN = i.all.Next()
	}
	rv := &Posting{
		iterator:  i,
		docNum:    uint64(n),
		offset:    i.offset,
		locoffset: i.locoffset,
		hasLoc:    i.locations.Contains(n),
	}
	// step past the posting just returned
	// NOTE(review): locoffset advances by the frequency of every posting,
	// including those for which hasLoc is false — this looks like it relies
	// on all postings of a list consistently having (or lacking) locations;
	// confirm.
	i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
	i.offset++
	return rv, nil
}
// Posting is a single entry in a postings list
type Posting struct {
	iterator  *PostingsIterator
	docNum    uint64 // document number within the segment
	offset    int    // index of this posting in the list's Freqs/Norms
	locoffset int    // index of this posting's first entry in the list's Loc* slices
	hasLoc    bool   // whether the list's location bitmap contains docNum
}
// Number returns the document number of this posting in this segment.
func (p *Posting) Number() uint64 {
	return p.docNum
}
// Frequency returns the frequency of occurrence of this term in this
// doc/field.
func (p *Posting) Frequency() uint64 {
	list := p.iterator.postings
	return list.dictionary.segment.Freqs[list.postingsID-1][p.offset]
}
// Norm returns the normalization factor for this posting, widened from the
// float32 kept in the segment.
func (p *Posting) Norm() float64 {
	list := p.iterator.postings
	stored := list.dictionary.segment.Norms[list.postingsID-1][p.offset]
	return float64(stored)
}
// Locations returns the location information for each occurrence, or nil
// when this posting has no location data. One Location is produced per unit
// of frequency, at consecutive offsets starting from the posting's location
// offset.
func (p *Posting) Locations() []segment.Location {
	if !p.hasLoc {
		return nil
	}
	count := int(p.Frequency())
	locs := make([]segment.Location, 0, count)
	for at := p.locoffset; at < p.locoffset+count; at++ {
		locs = append(locs, &Location{p: p, offset: at})
	}
	return locs
}
// Location represents the location of a single occurrence
type Location struct {
	p      *Posting
	offset int // index of this occurrence in the postings list's Loc* slices
}
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
	list := l.p.iterator.postings
	seg := list.dictionary.segment
	fieldID := seg.Locfields[list.postingsID-1][l.offset]
	return seg.FieldsInv[fieldID]
}
// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
	list := l.p.iterator.postings
	return list.dictionary.segment.Locstarts[list.postingsID-1][l.offset]
}
// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
	list := l.p.iterator.postings
	return list.dictionary.segment.Locends[list.postingsID-1][l.offset]
}
// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
	list := l.p.iterator.postings
	return list.dictionary.segment.Locpos[list.postingsID-1][l.offset]
}
// ArrayPositions returns the array position vector associated with this
// occurrence
func (l *Location) ArrayPositions() []uint64 {
	list := l.p.iterator.postings
	return list.dictionary.segment.Locarraypos[list.postingsID-1][l.offset]
}

@ -1,289 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mem
import (
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
)
// idFieldID is the field id of the _id field, which is always guaranteed to
// be 0.
const idFieldID uint16 = 0
// KNOWN ISSUES
// - LIMITATION - we decided whether or not to store term vectors for a field
// at the segment level, based on the first definition of a
// field we see. in normal bleve usage this is fine, all
// instances of a field definition will be the same. however,
// advanced users may violate this and provide unique field
// definitions with each document. this segment does not
// support this usage.
// TODO
// - need better testing of multiple docs, iterating freqs, locations and
//   verifying the correct results are returned
// Segment is an in memory implementation of scorch.Segment.
//
// Fields are identified by a zero-based field id and terms by a zero-based
// postings list id; the maps below store those ids plus one so that the
// zero value of a map lookup means "not present".
type Segment struct {
	// FieldsMap adds 1 to field id to avoid zero value issues
	// name -> field id + 1
	FieldsMap map[string]uint16
	// FieldsInv is the inverse of FieldsMap
	// field id -> name
	FieldsInv []string
	// Term dictionaries for each field
	// field id -> term -> postings list id + 1
	Dicts []map[string]uint64
	// Terms for each field, where terms are sorted ascending
	// field id -> []term
	DictKeys [][]string
	// Postings list
	// postings list id -> bitmap by docNum
	Postings []*roaring.Bitmap
	// Postings list has locations
	// postings list id -> bitmap of docNums that have location data
	PostingsLocs []*roaring.Bitmap
	// Term frequencies
	// postings list id -> Freqs (one for each hit in bitmap)
	Freqs [][]uint64
	// Field norms
	// postings list id -> Norms (one for each hit in bitmap)
	Norms [][]float32
	// Field/start/end/pos/locarraypos
	// postings list id -> start/end/pos/locarraypos (one for each freq)
	Locfields   [][]uint16
	Locstarts   [][]uint64
	Locends     [][]uint64
	Locpos      [][]uint64
	Locarraypos [][][]uint64
	// Stored field values
	// docNum -> field id -> slice of values (each value []byte)
	Stored []map[uint16][][]byte
	// Stored field types
	// docNum -> field id -> slice of types (each type byte)
	StoredTypes []map[uint16][]byte
	// Stored field array positions
	// docNum -> field id -> slice of array positions (each is []uint64)
	StoredPos []map[uint16][][]uint64
	// For storing the docValue persisted fields
	// field id -> true when the field includes doc values
	DocValueFields map[uint16]bool
	// Footprint of the segment, updated when analyzed document mutations
	// are added into the segment
	sizeInBytes uint64
}
// New builds a new empty Segment. Only the FieldsMap and DocValueFields maps
// are initialized; every other table starts out nil.
func New() *Segment {
	seg := new(Segment)
	seg.FieldsMap = map[string]uint16{}
	seg.DocValueFields = map[uint16]bool{}
	return seg
}
// updateSizeInBytes recomputes the segment's memory footprint and stores it
// in sizeInBytes.
//
// The figure is an estimate built by summing the payload of every table
// (string lengths, element counts times element size, bitmap sizes) plus
// fixed per-container overheads taken from the segment package's SizeOf*
// constants.
func (s *Segment) updateSizeInBytes() {
	var sizeInBytes uint64
	// FieldsMap, FieldsInv
	for k, _ := range s.FieldsMap {
		// each name is counted twice: once as a map key, once in FieldsInv
		sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
			2 /* size of uint16 */)
	}
	// overhead from the data structures
	sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
	// Dicts, DictKeys
	for _, entry := range s.Dicts {
		for k, _ := range entry {
			// each term is counted twice: once in Dicts, once in DictKeys
			sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
				8 /* size of uint64 */)
		}
		// overhead from the data structures
		sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
	}
	sizeInBytes += (segment.SizeOfSlice * 2)
	// Postings, PostingsLocs
	for i := 0; i < len(s.Postings); i++ {
		sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) +
			(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer)
	}
	sizeInBytes += (segment.SizeOfSlice * 2)
	// Freqs, Norms
	for i := 0; i < len(s.Freqs); i++ {
		sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ +
			len(s.Norms[i])*4 /* size of float32 */) +
			(segment.SizeOfSlice * 2)
	}
	sizeInBytes += (segment.SizeOfSlice * 2)
	// Location data
	for i := 0; i < len(s.Locfields); i++ {
		sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ +
			len(s.Locstarts[i])*8 /* size of uint64 */ +
			len(s.Locends[i])*8 /* size of uint64 */ +
			len(s.Locpos[i])*8 /* size of uint64 */)
		for j := 0; j < len(s.Locarraypos[i]); j++ {
			sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) +
				segment.SizeOfSlice
		}
		// one slice header per postings list for each of the five Loc* tables
		sizeInBytes += (segment.SizeOfSlice * 5)
	}
	sizeInBytes += (segment.SizeOfSlice * 5)
	// Stored data
	for i := 0; i < len(s.Stored); i++ {
		for _, v := range s.Stored[i] {
			sizeInBytes += uint64(2 /* size of uint16 */)
			for _, arr := range v {
				sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice
			}
			sizeInBytes += segment.SizeOfSlice
		}
		for _, v := range s.StoredTypes[i] {
			sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice
		}
		for _, v := range s.StoredPos[i] {
			sizeInBytes += uint64(2 /* size of uint16 */)
			for _, arr := range v {
				sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) +
					segment.SizeOfSlice
			}
			sizeInBytes += segment.SizeOfSlice
		}
		// overhead from map(s) within Stored, StoredTypes, StoredPos
		sizeInBytes += (segment.SizeOfMap * 3)
	}
	// overhead from data structures: Stored, StoredTypes, StoredPos
	sizeInBytes += (segment.SizeOfSlice * 3)
	// DocValueFields
	sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) +
		segment.SizeOfMap
	// SizeInBytes
	sizeInBytes += uint64(8)
	s.sizeInBytes = sizeInBytes
}
// SizeInBytes returns the footprint last stored by updateSizeInBytes; it
// does not recompute anything.
func (s *Segment) SizeInBytes() uint64 {
	return s.sizeInBytes
}
// AddRef is a no-op for the in-memory segment.
func (s *Segment) AddRef() {
}
// DecRef is a no-op for the in-memory segment and always returns nil.
func (s *Segment) DecRef() error {
	return nil
}
// Fields returns the field names used in this segment, indexed by field id.
// The segment's own FieldsInv slice is returned, not a copy.
func (s *Segment) Fields() []string {
	return s.FieldsInv
}
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field
// value of the specified doc number, passing the field name, type byte,
// value and array positions. Visiting stops as soon as the visitor returns
// false. A doc number beyond the stored documents is silently ignored. The
// returned error is always nil.
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
	idx := int(num)
	// ensure document number exists
	if idx >= len(s.Stored) {
		return nil
	}
	types, positions := s.StoredTypes[idx], s.StoredPos[idx]
	for fieldID, vals := range s.Stored[idx] {
		fieldTypes, fieldPositions := types[fieldID], positions[fieldID]
		for n := range vals {
			if !visitor(s.FieldsInv[fieldID], fieldTypes[n], vals[n], fieldPositions[n]) {
				return nil
			}
		}
	}
	return nil
}
// getField returns the zero-based id of the named field, or an error when
// the segment has no field with that name.
func (s *Segment) getField(name string) (int, error) {
	if idPlus1, known := s.FieldsMap[name]; known {
		return int(idPlus1 - 1), nil
	}
	return 0, fmt.Errorf("no field named %s", name)
}
// Dictionary returns the term dictionary for the specified field. An unknown
// field is not an error: it yields an empty dictionary instead.
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
	id, lookupErr := s.getField(field)
	if lookupErr == nil {
		return &Dictionary{segment: s, field: field, fieldID: uint16(id)}, nil
	}
	// no such field, return empty dictionary
	return &segment.EmptyDictionary{}, nil
}
// Count returns the number of documents in this segment, taken from the
// length of the Stored table
// (this has no notion of deleted docs)
func (s *Segment) Count() uint64 {
	return uint64(len(s.Stored))
}
// DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings. Ids that are not in the _id dictionary contribute
// nothing. The returned error is always nil.
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
	matched := roaring.New()
	// guard against empty segment
	if len(s.FieldsMap) == 0 {
		return matched, nil
	}
	idTerms := s.Dicts[idFieldID]
	for n := range ids {
		if pidPlus1 := idTerms[ids[n]]; pidPlus1 > 0 {
			matched.Or(s.Postings[pidPlus1-1])
		}
	}
	return matched, nil
}
// Close releases all resources associated with this segment. For the
// in-memory segment this does nothing and always returns nil.
func (s *Segment) Close() error {
	return nil
}

@ -0,0 +1,75 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segment
import (
"regexp/syntax"
"github.com/couchbase/vellum/regexp"
)
// ParseRegexp parses pattern with Perl syntax and builds the corresponding
// vellum regexp automaton, limited to regexp.DefaultLimit.
//
// When the pattern starts with a literal prefix, prefixBeg holds that prefix
// and prefixEnd holds the result of IncrementBytes on it; otherwise both are
// nil. On a parse or build error all results except err are nil.
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
	// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
	tree, err := syntax.Parse(pattern, syntax.Perl)
	if err != nil {
		return nil, nil, nil, err
	}
	a, err = regexp.NewParsedWithLimit(pattern, tree, regexp.DefaultLimit)
	if err != nil {
		return nil, nil, nil, err
	}
	if literal := LiteralPrefix(tree); literal != "" {
		prefixBeg = []byte(literal)
		prefixEnd = IncrementBytes(prefixBeg)
	}
	return a, prefixBeg, prefixEnd, nil
}
// LiteralPrefix returns the literal prefix given the parse tree for a
// regexp, or "" when the pattern has no usable literal prefix.
//
// The left-most branch of the tree is followed through concatenation nodes.
// The first non-concatenation node found there supplies the prefix only if
// it is a case-sensitive literal. A nil tree yields "".
func LiteralPrefix(s *syntax.Regexp) string {
	// traverse the left-most branch in the parse tree as long as the
	// node represents a concatenation
	for s != nil && s.Op == syntax.OpConcat {
		if len(s.Sub) < 1 {
			return ""
		}
		s = s.Sub[0]
	}
	if s == nil {
		// nil tree (or nil left-most child): nothing to inspect
		return ""
	}
	// A literal carrying the FoldCase flag matches several spellings (e.g.
	// "(?i)abc" matches "ABC"), so its runes are not an exact byte prefix
	// of every match and must not be reported as one.
	if s.Op == syntax.OpLiteral && s.Flags&syntax.FoldCase == 0 {
		return string(s.Rune)
	}
	return "" // no literal prefix
}
// IncrementBytes returns a copy of in incremented by one, treating the
// slice as a big-endian number: the last byte is bumped and any overflow
// carries into the byte before it. The input is not modified. nil is
// returned when every byte overflows, which includes an empty input.
func IncrementBytes(in []byte) []byte {
	out := append(make([]byte, 0, len(in)), in...)
	pos := len(out)
	for pos > 0 {
		pos--
		out[pos]++
		if out[pos] != 0 {
			return out // didn't overflow, so stop
		}
	}
	return nil // overflowed
}

@ -15,15 +15,14 @@
package segment package segment
import ( import (
"fmt"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/couchbase/vellum"
) )
// Overhead from go data structures when deployed on a 64-bit system. var ErrClosed = fmt.Errorf("index closed")
const SizeOfMap uint64 = 8
const SizeOfPointer uint64 = 8
const SizeOfSlice uint64 = 24
const SizeOfString uint64 = 16
// DocumentFieldValueVisitor defines a callback to be visited for each // DocumentFieldValueVisitor defines a callback to be visited for each
// stored field value. The return value determines if the visitor // stored field value. The return value determines if the visitor
@ -34,6 +33,9 @@ type Segment interface {
Dictionary(field string) (TermDictionary, error) Dictionary(field string) (TermDictionary, error)
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error
DocID(num uint64) ([]byte, error)
Count() uint64 Count() uint64
DocNumbers([]string) (*roaring.Bitmap, error) DocNumbers([]string) (*roaring.Bitmap, error)
@ -42,18 +44,21 @@ type Segment interface {
Close() error Close() error
SizeInBytes() uint64 Size() int
AddRef() AddRef()
DecRef() error DecRef() error
} }
type TermDictionary interface { type TermDictionary interface {
PostingsList(term string, except *roaring.Bitmap) (PostingsList, error) PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error)
Iterator() DictionaryIterator Iterator() DictionaryIterator
PrefixIterator(prefix string) DictionaryIterator PrefixIterator(prefix string) DictionaryIterator
RangeIterator(start, end string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator
AutomatonIterator(a vellum.Automaton,
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
} }
type DictionaryIterator interface { type DictionaryIterator interface {
@ -61,7 +66,9 @@ type DictionaryIterator interface {
} }
type PostingsList interface { type PostingsList interface {
Iterator() PostingsIterator Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator
Size() int
Count() uint64 Count() uint64
@ -77,6 +84,14 @@ type PostingsIterator interface {
// implementations may return a shared instance to reduce memory // implementations may return a shared instance to reduce memory
// allocations. // allocations.
Next() (Posting, error) Next() (Posting, error)
// Advance will return the posting with the specified doc number
// or if there is no such posting, the next posting.
// Callers MUST NOT attempt to pass a docNum that is less than or
// equal to the currently visited posting doc Num.
Advance(docNum uint64) (Posting, error)
Size() int
} }
type Posting interface { type Posting interface {
@ -86,6 +101,8 @@ type Posting interface {
Norm() float64 Norm() float64
Locations() []Location Locations() []Location
Size() int
} }
type Location interface { type Location interface {
@ -94,6 +111,7 @@ type Location interface {
End() uint64 End() uint64
Pos() uint64 Pos() uint64
ArrayPositions() []uint64 ArrayPositions() []uint64
Size() int
} }
// DocumentFieldTermVisitable is implemented by various scorch segment // DocumentFieldTermVisitable is implemented by various scorch segment
@ -101,10 +119,17 @@ type Location interface {
// postings or other indexed values. // postings or other indexed values.
type DocumentFieldTermVisitable interface { type DocumentFieldTermVisitable interface {
VisitDocumentFieldTerms(localDocNum uint64, fields []string, VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor) error visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error)
// VisitableDocValueFields implementation should return // VisitableDocValueFields implementation should return
// the list of fields which are document value persisted and // the list of fields which are document value persisted and
// therefore visitable by the above VisitDocumentFieldTerms method. // therefore visitable by the above VisitDocumentFieldTerms method.
VisitableDocValueFields() ([]string, error) VisitableDocValueFields() ([]string, error)
} }
// DocVisitState is the type of the optional state argument accepted by,
// and of the state value returned from,
// DocumentFieldTermVisitable.VisitDocumentFieldTerms. It declares no
// methods, so any value satisfies it.
// NOTE(review): presumably lets an implementation hand reusable state
// back to the caller for the next call - confirm against implementations.
type DocVisitState interface {
}
// StatsReporter receives notifications about the amount of data written
// while a segment is being persisted.
type StatsReporter interface {
// ReportBytesWritten is invoked with the number of bytes accepted by
// a single write.
ReportBytesWritten(bytesWritten uint64)
}

@ -16,19 +16,13 @@ package zap
import ( import (
"bufio" "bufio"
"bytes"
"encoding/binary"
"math" "math"
"os" "os"
"sort"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment/mem"
"github.com/couchbase/vellum"
"github.com/golang/snappy"
) )
const version uint32 = 3 const Version uint32 = 11
const Type string = "zap"
const fieldNotUninverted = math.MaxUint64 const fieldNotUninverted = math.MaxUint64
@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error {
return nil return nil
} }
// PersistSegment writes memSegment to a file at path using the zap file
// format. The file is opened read-write and created with mode 0600 when
// it does not exist yet. Output is buffered, then flushed, synced and
// closed before returning. When any step after opening fails, the file
// is closed and removed again and the failing step's error is returned.
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
	file, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0600)
	if err != nil {
		return err
	}

	// abort throws away the partially written file and hands back the
	// error that caused the abort
	abort := func(cause error) error {
		_ = file.Close()
		_ = os.Remove(path)
		return cause
	}

	// writes go through a buffer, wrapped so that offsets and the
	// checksum can be tracked
	buffered := bufio.NewWriter(file)
	counter := NewCountHashWriter(buffered)

	numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
		persistBase(memSegment, counter, chunkFactor)
	if err != nil {
		return abort(err)
	}

	if err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
		docValueOffset, chunkFactor, counter.Sum32(), counter); err != nil {
		return abort(err)
	}

	if err = buffered.Flush(); err != nil {
		return abort(err)
	}
	if err = file.Sync(); err != nil {
		return abort(err)
	}
	if err = file.Close(); err != nil {
		return abort(err)
	}

	return nil
}
// persistBase serializes every section of memSegment except the footer
// to cr and reports where the sections landed.
//
// When the segment holds at least one stored document the sections are
// written in this order: stored fields, posting freq/norm and location
// details, posting location bitmaps, postings lists, term dictionaries
// and doc values. Otherwise those sections are skipped, dictLocs holds
// one zero entry per field and docValueOffset stays at
// fieldNotUninverted. The fields index is written last in either case.
//
// On failure all numeric results are zero and dictLocs is nil.
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
	numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
	dictLocs []uint64, err error) {
	// every error path reports the same zeroed results
	fail := func(cause error) (uint64, uint64, uint64, uint64, []uint64, error) {
		return 0, 0, 0, 0, nil, cause
	}

	docValueOffset = uint64(fieldNotUninverted)

	if len(memSegment.Stored) == 0 {
		dictLocs = make([]uint64, len(memSegment.FieldsInv))
	} else {
		if storedIndexOffset, err = persistStored(memSegment, cr); err != nil {
			return fail(err)
		}

		var freqOffsets, locOffsets []uint64
		freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor)
		if err != nil {
			return fail(err)
		}

		var postingsListLocs []uint64
		if postingsListLocs, err = persistPostingsLocs(memSegment, cr); err != nil {
			return fail(err)
		}

		var postingsLocs []uint64
		postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
		if err != nil {
			return fail(err)
		}

		if dictLocs, err = persistDictionary(memSegment, cr, postingsLocs); err != nil {
			return fail(err)
		}

		if docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor); err != nil {
			return fail(err)
		}
	}

	if fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs); err != nil {
		return fail(err)
	}

	return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
		dictLocs, nil
}
// persistStored writes the stored fields of every document in
// memSegment.Stored to w, followed by an index of where each document's
// record starts.
//
// Each document record is laid out as
//
//	uvarint(len(meta)) | uvarint(len(compressed)) | meta | compressed
//
// where meta is the varint-encoded field metadata emitted through
// persistStoredFieldValues and compressed is the snappy encoding of the
// data buffer that the same helper fills. The trailing index holds one
// big-endian uint64 record offset per document.
//
// It returns the offset in w at which that index begins.
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
	var curr int
	var metaBuf bytes.Buffer
	var data, compressed []byte

	metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)

	docNumOffsets := make(map[int]uint64, len(memSegment.Stored))

	for docNum, storedValues := range memSegment.Stored {
		if docNum != 0 {
			// reset the per-document scratch state; compressed needs no
			// reset because it is handed to snappy at full capacity below
			curr = 0
			metaBuf.Reset()
			data = data[:0]
		}

		st := memSegment.StoredTypes[docNum]
		sp := memSegment.StoredPos[docNum]

		// encode fields in order
		for fieldID := range memSegment.FieldsInv {
			if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
				stf := st[uint16(fieldID)]
				spf := sp[uint16(fieldID)]

				var err2 error
				curr, data, err2 = persistStoredFieldValues(fieldID,
					storedFieldValues, stf, spf, curr, metaEncoder, data)
				if err2 != nil {
					return 0, err2
				}
			}
		}

		metaEncoder.Close()
		metaBytes := metaBuf.Bytes()

		// compress the data; snappy.Encode only reuses dst when len(dst)
		// is large enough, so expose the whole capacity of the scratch
		// buffer instead of a zero-length slice
		compressed = snappy.Encode(compressed[:cap(compressed)], data)

		// record where we're about to start writing
		docNumOffsets[docNum] = uint64(w.Count())

		// write out the meta len and compressed data len
		_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
		if err != nil {
			return 0, err
		}

		// now write the meta
		_, err = w.Write(metaBytes)
		if err != nil {
			return 0, err
		}

		// now write the compressed data
		_, err = w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	// return value is the start of the stored index
	rv := uint64(w.Count())

	// now write out the stored doc index
	for docNum := range memSegment.Stored {
		err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
		if err != nil {
			return 0, err
		}
	}

	return rv, nil
}
func persistStoredFieldValues(fieldID int, func persistStoredFieldValues(fieldID int,
storedFieldValues [][]byte, stf []byte, spf [][]uint64, storedFieldValues [][]byte, stf []byte, spf [][]uint64,
curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( curr int, metaEncode varintEncoder, data []byte) (
int, []byte, error) { int, []byte, error) {
for i := 0; i < len(storedFieldValues); i++ { for i := 0; i < len(storedFieldValues); i++ {
// encode field // encode field
_, err := metaEncoder.PutU64(uint64(fieldID)) _, err := metaEncode(uint64(fieldID))
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// encode type // encode type
_, err = metaEncoder.PutU64(uint64(stf[i])) _, err = metaEncode(uint64(stf[i]))
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// encode start offset // encode start offset
_, err = metaEncoder.PutU64(uint64(curr)) _, err = metaEncode(uint64(curr))
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// end len // end len
_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) _, err = metaEncode(uint64(len(storedFieldValues[i])))
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// encode number of array pos // encode number of array pos
_, err = metaEncoder.PutU64(uint64(len(spf[i]))) _, err = metaEncode(uint64(len(spf[i])))
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// encode all array positions // encode all array positions
for _, pos := range spf[i] { for _, pos := range spf[i] {
_, err = metaEncoder.PutU64(pos) _, err = metaEncode(pos)
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int,
return curr, data, nil return curr, data, nil
} }
// persistPostingDetails writes the per-posting detail streams of
// memSegment to w in two passes over memSegment.Postings: first the
// freq/norm stream of every postings list, then the location stream of
// every postings list.
//
// It returns two slices holding, in iteration order of
// memSegment.Postings, the offset in w at which each freq/norm stream and
// each location stream starts. On error both slices are nil.
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
var freqOffsets, locOfffsets []uint64
// NOTE(review): the second argument looks like the highest document
// number (len(Stored)-1) - confirm against newChunkedIntCoder.
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
for postingID := range memSegment.Postings {
// one encoder is shared by all postings lists; the first pass through
// the loop uses it fresh, later passes reset it first
if postingID != 0 {
tfEncoder.Reset()
}
freqs := memSegment.Freqs[postingID]
norms := memSegment.Norms[postingID]
postingsListItr := memSegment.Postings[postingID].Iterator()
// offset walks freqs/norms in step with the postings iterator
var offset int
for postingsListItr.HasNext() {
docNum := uint64(postingsListItr.Next())
// put freq
err := tfEncoder.Add(docNum, freqs[offset])
if err != nil {
return nil, nil, err
}
// put norm
norm := norms[offset]
// the norm is stored as its IEEE 754 float32 bit pattern
normBits := math.Float32bits(norm)
err = tfEncoder.Add(docNum, uint64(normBits))
if err != nil {
return nil, nil, err
}
offset++
}
// record where this postings freq info starts
// (taken before the encoder's content is written to w)
freqOffsets = append(freqOffsets, uint64(w.Count()))
tfEncoder.Close()
_, err := tfEncoder.Write(w)
if err != nil {
return nil, nil, err
}
}
// now do it again for the locations
locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
for postingID := range memSegment.Postings {
if postingID != 0 {
locEncoder.Reset()
}
freqs := memSegment.Freqs[postingID]
locfields := memSegment.Locfields[postingID]
locpos := memSegment.Locpos[postingID]
locstarts := memSegment.Locstarts[postingID]
locends := memSegment.Locends[postingID]
locarraypos := memSegment.Locarraypos[postingID]
postingsListItr := memSegment.Postings[postingID].Iterator()
// offset indexes freqs (one entry per document), locOffset indexes the
// loc* slices (one entry per occurrence)
var offset int
var locOffset int
for postingsListItr.HasNext() {
docNum := uint64(postingsListItr.Next())
// the document's frequency gives the number of occurrences to visit
for i := 0; i < int(freqs[offset]); i++ {
// location details are only emitted when this postings list has
// any; locOffset advances either way
if len(locfields) > 0 {
// put field
err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
if err != nil {
return nil, nil, err
}
// put pos
err = locEncoder.Add(docNum, locpos[locOffset])
if err != nil {
return nil, nil, err
}
// put start
err = locEncoder.Add(docNum, locstarts[locOffset])
if err != nil {
return nil, nil, err
}
// put end
err = locEncoder.Add(docNum, locends[locOffset])
if err != nil {
return nil, nil, err
}
// put the number of array positions to follow
num := len(locarraypos[locOffset])
err = locEncoder.Add(docNum, uint64(num))
if err != nil {
return nil, nil, err
}
// put each array position
for _, pos := range locarraypos[locOffset] {
err = locEncoder.Add(docNum, pos)
if err != nil {
return nil, nil, err
}
}
}
locOffset++
}
offset++
}
// record where this postings loc info starts
locOfffsets = append(locOfffsets, uint64(w.Count()))
locEncoder.Close()
_, err := locEncoder.Write(w)
if err != nil {
return nil, nil, err
}
}
return freqOffsets, locOfffsets, nil
}
// persistPostingsLocs writes every entry of memSegment.PostingsLocs to w
// as a length-prefixed roaring bitmap. The returned slice holds the
// offset in w at which each entry starts, in iteration order. On a write
// failure the result is nil and the error is returned.
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
	// scratch space shared by all bitmap writes
	var scratch bytes.Buffer
	varintScratch := make([]byte, binary.MaxVarintLen64)

	starts := make([]uint64, 0, len(memSegment.PostingsLocs))
	for _, locBitmap := range memSegment.PostingsLocs {
		// the entry begins wherever the writer currently stands
		starts = append(starts, uint64(w.Count()))

		if _, err = writeRoaringWithLen(locBitmap, w, &scratch, varintScratch); err != nil {
			return nil, err
		}
	}

	return starts, nil
}
// persistPostingsLists writes one record per entry of
// memSegment.Postings to w. A record consists of three uvarints (the
// entry's freq offset, loc offset and postings-loc offset, looked up by
// posting ID in the supplied slices) followed by the length-prefixed
// roaring bitmap of the postings list itself.
//
// The returned slice holds the offset in w at which each record starts.
// On a write failure the result is nil and the error is returned.
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
	postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
	// scratch space shared by all bitmap writes
	var scratch bytes.Buffer
	varintScratch := make([]byte, binary.MaxVarintLen64)

	starts := make([]uint64, 0, len(memSegment.Postings))
	for postingID, postings := range memSegment.Postings {
		// the record begins wherever the writer currently stands
		starts = append(starts, uint64(w.Count()))

		// header: where to find the term info, loc info and loc bitmap
		_, err = writeUvarints(w, freqOffsets[postingID],
			locOffsets[postingID], postingsListLocs[postingID])
		if err != nil {
			return nil, err
		}

		// body: the postings bitmap, prefixed with its length
		if _, err = writeRoaringWithLen(postings, w, &scratch, varintScratch); err != nil {
			return nil, err
		}
	}

	return starts, nil
}
// persistDictionary builds one vellum FST per entry of
// memSegment.DictKeys, mapping each term to the address of its postings
// list, and writes each FST to w prefixed with its uvarint-encoded
// length.
//
// postingsLocs is indexed by posting ID; the IDs found in
// memSegment.Dicts are one higher than the index they refer to.
//
// The returned slice holds the offset in w at which each dictionary
// starts. On failure the result is nil and the error is returned.
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
	dictStarts := make([]uint64, 0, len(memSegment.DictKeys))
	lenScratch := make([]byte, binary.MaxVarintLen64)

	// a single buffer receives each field's FST in turn
	var fstBuf bytes.Buffer
	for fieldID, terms := range memSegment.DictKeys {
		if fieldID != 0 {
			fstBuf.Reset()
		}

		builder, err := vellum.New(&fstBuf, nil)
		if err != nil {
			return nil, err
		}

		// terms arrive already sorted, which is the order they are
		// inserted in
		termToPosting := memSegment.Dicts[fieldID]
		for _, term := range terms {
			addr := postingsLocs[termToPosting[term]-1]
			if err = builder.Insert([]byte(term), addr); err != nil {
				return nil, err
			}
		}
		if err = builder.Close(); err != nil {
			return nil, err
		}

		// the dictionary begins wherever the writer currently stands
		dictStarts = append(dictStarts, uint64(w.Count()))

		// length prefix first, then the FST bytes themselves
		fstBytes := fstBuf.Bytes()
		n := binary.PutUvarint(lenScratch, uint64(len(fstBytes)))
		if _, err = w.Write(lenScratch[:n]); err != nil {
			return nil, err
		}
		if _, err = w.Write(fstBytes); err != nil {
			return nil, err
		}
	}

	return dictStarts, nil
}
// docIDRange is a slice of document numbers that implements
// sort.Interface, ordering its elements ascending.
type docIDRange []uint64

// Len reports the number of document numbers held.
func (r docIDRange) Len() int {
	return len(r)
}

// Swap exchanges the elements at positions i and j.
func (r docIDRange) Swap(i, j int) {
	held := r[i]
	r[i] = r[j]
	r[j] = held
}

// Less reports whether the element at i is smaller than the one at j.
func (r docIDRange) Less(i, j int) bool {
	return r[j] > r[i]
}
// persistDocValues writes the doc values of every field listed in
// memSegment.DocValueFields to w.
//
// For each such field it walks the field's term dictionary and, for
// every term, the term's postings, appending the term followed by
// termSeparator to the entry of each document the term occurs in. The
// per-document entries are then added to a chunked content coder in
// ascending document number order and the coder's output is written to w.
//
// It returns a map from field ID to the offset in w at which that
// field's doc values start. On failure the map is nil and the error of
// the failing step is returned.
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
	chunkFactor uint32) (map[uint16]uint64, error) {
	fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv))
	fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))

	for fieldID := range memSegment.DocValueFields {
		field := memSegment.FieldsInv[fieldID]
		docTermMap := make(map[uint64][]byte)
		dict, err := memSegment.Dictionary(field)
		if err != nil {
			return nil, err
		}

		dictItr := dict.Iterator()
		next, err := dictItr.Next()
		for err == nil && next != nil {
			postings, err1 := dict.PostingsList(next.Term, nil)
			if err1 != nil {
				// err is necessarily nil inside this loop, so err1 has
				// to be the one reported or the failure is lost
				return nil, err1
			}

			postingsItr := postings.Iterator()
			nextPosting, err2 := postingsItr.Next()
			for err2 == nil && nextPosting != nil {
				docNum := nextPosting.Number()
				docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...)
				docTermMap[docNum] = append(docTermMap[docNum], termSeparator)
				nextPosting, err2 = postingsItr.Next()
			}
			if err2 != nil {
				return nil, err2
			}

			next, err = dictItr.Next()
		}
		if err != nil {
			return nil, err
		}

		// sort by document number
		docNumbers := make(docIDRange, 0, len(docTermMap))
		for k := range docTermMap {
			docNumbers = append(docNumbers, k)
		}
		sort.Sort(docNumbers)

		for _, docNum := range docNumbers {
			err = fdvEncoder.Add(docNum, docTermMap[docNum])
			if err != nil {
				return nil, err
			}
		}

		// the field's doc values begin wherever the writer currently stands
		fieldChunkOffsets[fieldID] = uint64(w.Count())
		err = fdvEncoder.Close()
		if err != nil {
			return nil, err
		}
		// persist the doc value details for this field
		_, err = fdvEncoder.Write(w)
		if err != nil {
			return nil, err
		}
		// reset the encoder for the next field
		fdvEncoder.Reset()
	}

	return fieldChunkOffsets, nil
}
// persistFieldDocValues writes the doc values of memSegment to w via
// persistDocValues and then appends a table with one uvarint per field
// in memSegment.FieldsInv: the offset of that field's doc values, or
// fieldNotUninverted when the field has none.
//
// It returns the offset in w at which that table begins.
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
	chunkFactor uint32) (uint64, error) {
	dvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
	if err != nil {
		return 0, err
	}

	tableStart := uint64(w.Count())
	scratch := make([]byte, binary.MaxVarintLen64)

	for fieldID := range memSegment.FieldsInv {
		// fields without doc values are flagged with the sentinel
		dvOffset, found := dvOffsets[uint16(fieldID)]
		if !found {
			dvOffset = fieldNotUninverted
		}

		n := binary.PutUvarint(scratch, dvOffset)
		if _, err := w.Write(scratch[:n]); err != nil {
			return 0, err
		}
	}

	return tableStart, nil
}
// NewSegmentBase serializes memSegment into an in-memory buffer with
// persistBase and wraps the resulting bytes, their checksum and the
// reported section offsets in a SegmentBase via InitSegmentBase.
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
	encoded := new(bytes.Buffer)
	counter := NewCountHashWriter(encoded)

	numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
		persistBase(memSegment, counter, chunkFactor)
	if err != nil {
		return nil, err
	}

	return InitSegmentBase(
		encoded.Bytes(), counter.Sum32(), chunkFactor,
		memSegment.FieldsMap, memSegment.FieldsInv, numDocs,
		storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs,
	)
}
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
fieldsIndexOffset: fieldsIndexOffset, fieldsIndexOffset: fieldsIndexOffset,
docValueOffset: docValueOffset, docValueOffset: docValueOffset,
dictLocs: dictLocs, dictLocs: dictLocs,
fieldDvIterMap: make(map[uint16]*docValueIterator), fieldDvReaders: make(map[uint16]*docValueReader),
} }
sb.updateSize()
err := sb.loadDvIterators() err := sb.loadDvReaders()
if err != nil { if err != nil {
return nil, err return nil, err
} }

@ -18,10 +18,18 @@ import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"io" "io"
"reflect"
"github.com/golang/snappy" "github.com/golang/snappy"
) )
// reflectStaticSizeMetaData holds the number of bytes needed to store a
// MetaData value, as reported by the reflect package. It is filled in
// once during package initialization.
var reflectStaticSizeMetaData int

func init() {
	reflectStaticSizeMetaData = int(reflect.TypeOf(MetaData{}).Size())
}
var termSeparator byte = 0xff var termSeparator byte = 0xff
var termSeparatorSplitSlice = []byte{termSeparator} var termSeparatorSplitSlice = []byte{termSeparator}
@ -30,29 +38,36 @@ type chunkedContentCoder struct {
chunkSize uint64 chunkSize uint64
currChunk uint64 currChunk uint64
chunkLens []uint64 chunkLens []uint64
w io.Writer
progressiveWrite bool
chunkMetaBuf bytes.Buffer chunkMetaBuf bytes.Buffer
chunkBuf bytes.Buffer chunkBuf bytes.Buffer
chunkMeta []MetaData chunkMeta []MetaData
compressed []byte // temp buf for snappy compression
} }
// MetaData represents the data information inside a // MetaData represents the data information inside a
// chunk. // chunk.
type MetaData struct { type MetaData struct {
DocNum uint64 // docNum of the data inside the chunk DocNum uint64 // docNum of the data inside the chunk
DocDvLoc uint64 // starting offset for a given docid DocDvOffset uint64 // offset of data inside the chunk for the given docid
DocDvLen uint64 // length of data inside the chunk for the given docid
} }
// newChunkedContentCoder returns a new chunk content coder which // newChunkedContentCoder returns a new chunk content coder which
// packs data into chunks based on the provided chunkSize // packs data into chunks based on the provided chunkSize
func newChunkedContentCoder(chunkSize uint64, func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64,
maxDocNum uint64) *chunkedContentCoder { w io.Writer, progressiveWrite bool) *chunkedContentCoder {
total := maxDocNum/chunkSize + 1 total := maxDocNum/chunkSize + 1
rv := &chunkedContentCoder{ rv := &chunkedContentCoder{
chunkSize: chunkSize, chunkSize: chunkSize,
chunkLens: make([]uint64, total), chunkLens: make([]uint64, total),
chunkMeta: make([]MetaData, 0, total), chunkMeta: make([]MetaData, 0, total),
w: w,
progressiveWrite: progressiveWrite,
} }
return rv return rv
@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error {
// write out the metaData slice // write out the metaData slice
for _, meta := range c.chunkMeta { for _, meta := range c.chunkMeta {
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen) _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
if err != nil { if err != nil {
return err return err
} }
@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error {
metaData := c.chunkMetaBuf.Bytes() metaData := c.chunkMetaBuf.Bytes()
c.final = append(c.final, c.chunkMetaBuf.Bytes()...) c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
// write the compressed data to the final data // write the compressed data to the final data
compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
c.final = append(c.final, compressedData...) c.final = append(c.final, c.compressed...)
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))
if c.progressiveWrite {
_, err := c.w.Write(c.final)
if err != nil {
return err
}
c.final = c.final[:0]
}
c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData))
return nil return nil
} }
@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
c.currChunk = chunk c.currChunk = chunk
} }
// mark the starting offset for this doc // get the starting offset for this doc
dvOffset := c.chunkBuf.Len() dvOffset := c.chunkBuf.Len()
dvSize, err := c.chunkBuf.Write(vals) dvSize, err := c.chunkBuf.Write(vals)
if err != nil { if err != nil {
@ -131,37 +155,76 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
c.chunkMeta = append(c.chunkMeta, MetaData{ c.chunkMeta = append(c.chunkMeta, MetaData{
DocNum: docNum, DocNum: docNum,
DocDvLoc: uint64(dvOffset), DocDvOffset: uint64(dvOffset + dvSize),
DocDvLen: uint64(dvSize),
}) })
return nil return nil
} }
// Write commits all the encoded chunked contents to the provided writer. // Write commits all the encoded chunked contents to the provided writer.
func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { //
// | ..... data ..... | chunk offsets (varints)
// | position of chunk offsets (uint64) | number of offsets (uint64) |
//
func (c *chunkedContentCoder) Write() (int, error) {
var tw int var tw int
buf := make([]byte, binary.MaxVarintLen64)
// write out the number of chunks if c.final != nil {
n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) // write out the data section first
nw, err := w.Write(buf[:n]) nw, err := c.w.Write(c.final)
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
// write out the chunk lens }
for _, chunkLen := range c.chunkLens {
n := binary.PutUvarint(buf, uint64(chunkLen)) chunkOffsetsStart := uint64(tw)
nw, err = w.Write(buf[:n])
if cap(c.final) < binary.MaxVarintLen64 {
c.final = make([]byte, binary.MaxVarintLen64)
} else {
c.final = c.final[0:binary.MaxVarintLen64]
}
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
// write out the chunk offsets
for _, chunkOffset := range chunkOffsets {
n := binary.PutUvarint(c.final, chunkOffset)
nw, err := c.w.Write(c.final[:n])
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
} }
// write out the data
nw, err = w.Write(c.final) chunkOffsetsLen := uint64(tw) - chunkOffsetsStart
c.final = c.final[0:8]
// write out the length of chunk offsets
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
// write out the number of chunks
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
nw, err = c.w.Write(c.final)
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
c.final = c.final[:0]
return tw, nil return tw, nil
} }
// ReadDocValueBoundary returns the start and end offsets recorded for
// the entry at index chunk of metaHeaders. The end is that entry's
// DocDvOffset; the start is the DocDvOffset of the preceding entry, or
// zero for the first entry.
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
	if chunk <= 0 {
		// nothing precedes the first entry, so it starts at zero
		return 0, metaHeaders[chunk].DocDvOffset
	}
	previous := metaHeaders[chunk-1]
	return previous.DocDvOffset, metaHeaders[chunk].DocDvOffset
}

@ -17,6 +17,8 @@ package zap
import ( import (
"hash/crc32" "hash/crc32"
"io" "io"
"github.com/blevesearch/bleve/index/scorch/segment"
) )
// CountHashWriter is a wrapper around a Writer which counts the number of // CountHashWriter is a wrapper around a Writer which counts the number of
@ -25,6 +27,7 @@ type CountHashWriter struct {
w io.Writer w io.Writer
crc uint32 crc uint32
n int n int
s segment.StatsReporter
} }
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer // NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter {
return &CountHashWriter{w: w} return &CountHashWriter{w: w}
} }
// NewCountHashWriterWithStatsReporter returns a CountHashWriter which
// wraps the provided Writer and additionally carries the provided
// StatsReporter.
func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter {
	chw := new(CountHashWriter)
	chw.w = w
	chw.s = s
	return chw
}
// Write writes the provided bytes to the wrapped writer and counts the bytes // Write writes the provided bytes to the wrapped writer and counts the bytes
func (c *CountHashWriter) Write(b []byte) (int, error) { func (c *CountHashWriter) Write(b []byte) (int, error) {
n, err := c.w.Write(b) n, err := c.w.Write(b)
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
c.n += n c.n += n
if c.s != nil {
c.s.ReportBytesWritten(uint64(n))
}
return n, err return n, err
} }

@ -15,13 +15,13 @@
package zap package zap
import ( import (
"bytes"
"fmt" "fmt"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum" "github.com/couchbase/vellum"
"github.com/couchbase/vellum/regexp"
) )
// Dictionary is the zap representation of the term dictionary // Dictionary is the zap representation of the term dictionary
@ -30,23 +30,36 @@ type Dictionary struct {
field string field string
fieldID uint16 fieldID uint16
fst *vellum.FST fst *vellum.FST
fstReader *vellum.Reader
} }
// PostingsList returns the postings list for the specified term // PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
return d.postingsList([]byte(term), except, nil) prealloc segment.PostingsList) (segment.PostingsList, error) {
var preallocPL *PostingsList
pl, ok := prealloc.(*PostingsList)
if ok && pl != nil {
preallocPL = pl
}
return d.postingsList(term, except, preallocPL)
} }
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
if d.fst == nil { if d.fstReader == nil {
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil return d.postingsListInit(rv, except), nil
} }
postingsOffset, exists, err := d.fst.Get(term) postingsOffset, exists, err := d.fstReader.Get(term)
if err != nil { if err != nil {
return nil, fmt.Errorf("vellum err: %v", err) return nil, fmt.Errorf("vellum err: %v", err)
} }
if !exists { if !exists {
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil return d.postingsListInit(rv, except), nil
} }
@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari
} }
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
if rv == nil { if rv == nil || rv == emptyPostingsList {
rv = &PostingsList{} rv = &PostingsList{}
} else { } else {
postings := rv.postings
if postings != nil {
postings.Clear()
}
*rv = PostingsList{} // clear the struct *rv = PostingsList{} // clear the struct
rv.postings = postings
} }
rv.sb = d.sb rv.sb = d.sb
rv.except = except rv.except = except
@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator {
itr, err := d.fst.Iterator(nil, nil) itr, err := d.fst.Iterator(nil, nil)
if err == nil { if err == nil {
rv.itr = itr rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
} }
} }
@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
d: d, d: d,
} }
kBeg := []byte(prefix)
kEnd := segment.IncrementBytes(kBeg)
if d.fst != nil { if d.fst != nil {
r, err := regexp.New(prefix + ".*") itr, err := d.fst.Iterator(kBeg, kEnd)
if err == nil {
itr, err := d.fst.Search(r, nil, nil)
if err == nil { if err == nil {
rv.itr = itr rv.itr = itr
} } else if err != vellum.ErrIteratorDone {
rv.err = err
} }
} }
@ -130,7 +154,72 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
itr, err := d.fst.Iterator([]byte(start), endBytes) itr, err := d.fst.Iterator([]byte(start), endBytes)
if err == nil { if err == nil {
rv.itr = itr rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
// AutomatonIterator returns an iterator which only visits terms
// accepted by the vellum automaton and within the start/end key range
func (d *Dictionary) AutomatonIterator(a vellum.Automaton,
startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
} }
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte,
includeCount bool) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
omitCount: !includeCount,
}
var buf bytes.Buffer
builder, err := vellum.New(&buf, nil)
if err != nil {
rv.err = err
return rv
}
for _, term := range onlyTerms {
err = builder.Insert(term, 0)
if err != nil {
rv.err = err
return rv
}
}
err = builder.Close()
if err != nil {
rv.err = err
return rv
}
onlyFST, err := vellum.Load(buf.Bytes())
if err != nil {
rv.err = err
return rv
}
itr, err := d.fst.Search(onlyFST, nil, nil)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
} }
return rv return rv
@ -142,24 +231,26 @@ type DictionaryIterator struct {
itr vellum.Iterator itr vellum.Iterator
err error err error
tmp PostingsList tmp PostingsList
entry index.DictEntry
omitCount bool
} }
// Next returns the next entry in the dictionary // Next returns the next entry in the dictionary
func (i *DictionaryIterator) Next() (*index.DictEntry, error) { func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
if i.itr == nil || i.err == vellum.ErrIteratorDone { if i.err != nil && i.err != vellum.ErrIteratorDone {
return nil, nil
} else if i.err != nil {
return nil, i.err return nil, i.err
} else if i.itr == nil || i.err == vellum.ErrIteratorDone {
return nil, nil
} }
term, postingsOffset := i.itr.Current() term, postingsOffset := i.itr.Current()
i.entry.Term = string(term)
if !i.omitCount {
i.err = i.tmp.read(postingsOffset, i.d) i.err = i.tmp.read(postingsOffset, i.d)
if i.err != nil { if i.err != nil {
return nil, i.err return nil, i.err
} }
rv := &index.DictEntry{ i.entry.Count = i.tmp.Count()
Term: string(term),
Count: i.tmp.Count(),
} }
i.err = i.itr.Next() i.err = i.itr.Next()
return rv, nil return &i.entry, nil
} }

@ -19,93 +19,129 @@ import (
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"math" "math"
"reflect"
"sort" "sort"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
"github.com/golang/snappy" "github.com/golang/snappy"
) )
type docValueIterator struct { var reflectStaticSizedocValueReader int
// init records the static size of a docValueReader, as reported by
// reflection, in reflectStaticSizedocValueReader so it is computed only once.
func init() {
	reflectStaticSizedocValueReader = int(reflect.TypeOf(docValueReader{}).Size())
}
// docNumTermsVisitor is a callback that receives a document number together
// with the byte slice of terms recorded for that document. Returning a
// non-nil error stops the iteration driving the callback (see
// iterateAllDocValues), which hands the error back to its caller.
type docNumTermsVisitor func(docNum uint64, terms []byte) error
// docVisitState holds the doc value readers handed back and forth through
// VisitDocumentFieldTerms so that they can be reused across calls.
type docVisitState struct {
	// dvrs maps a field ID to the doc value reader cloned for that field.
	dvrs map[uint16]*docValueReader
	// segment is the segment the readers in dvrs belong to; when a call
	// arrives for a different segment, dvrs is discarded and rebuilt.
	segment *Segment
}
type docValueReader struct {
field string field string
curChunkNum uint64 curChunkNum uint64
numChunks uint64 chunkOffsets []uint64
chunkLens []uint64
dvDataLoc uint64 dvDataLoc uint64
curChunkHeader []MetaData curChunkHeader []MetaData
curChunkData []byte // compressed data cache curChunkData []byte // compressed data cache
uncompressed []byte // temp buf for snappy decompression
} }
func (di *docValueIterator) sizeInBytes() uint64 { func (di *docValueReader) size() int {
// curChunkNum, numChunks, dvDataLoc --> uint64 return reflectStaticSizedocValueReader + size.SizeOfPtr +
sizeInBytes := 24 len(di.field) +
len(di.chunkOffsets)*size.SizeOfUint64 +
// field len(di.curChunkHeader)*reflectStaticSizeMetaData +
sizeInBytes += (len(di.field) + int(segment.SizeOfString)) len(di.curChunkData)
}
// chunkLens, curChunkHeader func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
sizeInBytes += len(di.chunkLens)*8 + if rv == nil {
len(di.curChunkHeader)*24 + rv = &docValueReader{}
int(segment.SizeOfSlice*2) /* overhead from slices */ }
// curChunkData is mmap'ed, not included rv.field = di.field
rv.curChunkNum = math.MaxUint64
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
rv.dvDataLoc = di.dvDataLoc
rv.curChunkHeader = rv.curChunkHeader[:0]
rv.curChunkData = nil
rv.uncompressed = rv.uncompressed[:0]
return uint64(sizeInBytes) return rv
} }
func (di *docValueIterator) fieldName() string { func (di *docValueReader) fieldName() string {
return di.field return di.field
} }
func (di *docValueIterator) curChunkNumber() uint64 { func (di *docValueReader) curChunkNumber() uint64 {
return di.curChunkNum return di.curChunkNum
} }
func (s *SegmentBase) loadFieldDocValueIterator(field string, func (s *SegmentBase) loadFieldDocValueReader(field string,
fieldDvLoc uint64) (*docValueIterator, error) { fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
// get the docValue offset for the given fields // get the docValue offset for the given fields
if fieldDvLoc == fieldNotUninverted { if fieldDvLocStart == fieldNotUninverted {
return nil, fmt.Errorf("loadFieldDocValueIterator: "+ return nil, fmt.Errorf("loadFieldDocValueReader: "+
"no docValues found for field: %s", field) "no docValues found for field: %s", field)
} }
// read the number of chunks, chunk lengths // read the number of chunks, and chunk offsets position
var offset, clen uint64 var numChunks, chunkOffsetsPosition uint64
numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
if read <= 0 { if fieldDvLocEnd-fieldDvLocStart > 16 {
return nil, fmt.Errorf("failed to read the field "+ numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
"doc values for field %s", field) // read the length of chunk offsets
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
// acquire position of chunk offsets
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
} }
offset += uint64(read)
fdvIter := &docValueIterator{ fdvIter := &docValueReader{
curChunkNum: math.MaxUint64, curChunkNum: math.MaxUint64,
field: field, field: field,
chunkLens: make([]uint64, int(numChunks)), chunkOffsets: make([]uint64, int(numChunks)),
} }
// read the chunk offsets
var offset uint64
for i := 0; i < int(numChunks); i++ { for i := 0; i < int(numChunks); i++ {
clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
if read <= 0 { if read <= 0 {
return nil, fmt.Errorf("corrupted chunk length during segment load") return nil, fmt.Errorf("corrupted chunk offset during segment load")
} }
fdvIter.chunkLens[i] = clen fdvIter.chunkOffsets[i] = loc
offset += uint64(read) offset += uint64(read)
} }
fdvIter.dvDataLoc = fieldDvLoc + offset // set the data offset
fdvIter.dvDataLoc = fieldDvLocStart
return fdvIter, nil return fdvIter, nil
} }
func (di *docValueIterator) loadDvChunk(chunkNumber, func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
localDocNum uint64, s *SegmentBase) error {
// advance to the chunk where the docValues // advance to the chunk where the docValues
// reside for the given docNum // reside for the given docNum
destChunkDataLoc := di.dvDataLoc destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
for i := 0; i < int(chunkNumber); i++ { start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
destChunkDataLoc += di.chunkLens[i] if start >= end {
di.curChunkHeader = di.curChunkHeader[:0]
di.curChunkData = nil
di.curChunkNum = chunkNumber
di.uncompressed = di.uncompressed[:0]
return nil
} }
curChunkSize := di.chunkLens[chunkNumber] destChunkDataLoc += start
curChunkEnd += end
// read the number of docs reside in the chunk // read the number of docs reside in the chunk
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
if read <= 0 { if read <= 0 {
@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
chunkMetaLoc := destChunkDataLoc + uint64(read) chunkMetaLoc := destChunkDataLoc + uint64(read)
offset := uint64(0) offset := uint64(0)
if cap(di.curChunkHeader) < int(numDocs) {
di.curChunkHeader = make([]MetaData, int(numDocs)) di.curChunkHeader = make([]MetaData, int(numDocs))
} else {
di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
}
for i := 0; i < int(numDocs); i++ { for i := 0; i < int(numDocs); i++ {
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read) offset += uint64(read)
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read) offset += uint64(read)
} }
compressedDataLoc := chunkMetaLoc + offset compressedDataLoc := chunkMetaLoc + offset
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc dataLength := curChunkEnd - compressedDataLoc
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
di.curChunkNum = chunkNumber di.curChunkNum = chunkNumber
di.uncompressed = di.uncompressed[:0]
return nil
}
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
for i := 0; i < len(di.chunkOffsets); i++ {
err := di.loadDvChunk(uint64(i), s)
if err != nil {
return err
}
if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
continue
}
// uncompress the already loaded data
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
if err != nil {
return err
}
di.uncompressed = uncompressed
start := uint64(0)
for _, entry := range di.curChunkHeader {
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
if err != nil {
return err
}
start = entry.DocDvOffset
}
}
return nil return nil
} }
func (di *docValueIterator) visitDocValues(docNum uint64, func (di *docValueReader) visitDocValues(docNum uint64,
visitor index.DocumentFieldTermVisitor) error { visitor index.DocumentFieldTermVisitor) error {
// binary search the term locations for the docNum // binary search the term locations for the docNum
start, length := di.getDocValueLocs(docNum) start, end := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || length == math.MaxUint64 { if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
return nil return nil
} }
var uncompressed []byte
var err error
// use the uncompressed copy if available
if len(di.uncompressed) > 0 {
uncompressed = di.uncompressed
} else {
// uncompress the already loaded data // uncompress the already loaded data
uncompressed, err := snappy.Decode(nil, di.curChunkData) uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
if err != nil { if err != nil {
return err return err
} }
di.uncompressed = uncompressed
}
// pick the terms for the given docNum // pick the terms for the given docNum
uncompressed = uncompressed[start : start+length] uncompressed = uncompressed[start:end]
for { for {
i := bytes.Index(uncompressed, termSeparatorSplitSlice) i := bytes.Index(uncompressed, termSeparatorSplitSlice)
if i < 0 { if i < 0 {
@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
return nil return nil
} }
func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
i := sort.Search(len(di.curChunkHeader), func(i int) bool { i := sort.Search(len(di.curChunkHeader), func(i int) bool {
return di.curChunkHeader[i].DocNum >= docNum return di.curChunkHeader[i].DocNum >= docNum
}) })
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen return ReadDocValueBoundary(i, di.curChunkHeader)
} }
return math.MaxUint64, math.MaxUint64 return math.MaxUint64, math.MaxUint64
} }
// VisitDocumentFieldTerms is an implementation of the // VisitDocumentFieldTerms is an implementation of the
// DocumentFieldTermVisitable interface // DocumentFieldTermVisitable interface
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor) error { visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
fieldIDPlus1 := uint16(0) segment.DocVisitState, error) {
ok := true dvs, ok := dvsIn.(*docVisitState)
if !ok || dvs == nil {
dvs = &docVisitState{}
} else {
if dvs.segment != s {
dvs.segment = s
dvs.dvrs = nil
}
}
var fieldIDPlus1 uint16
if dvs.dvrs == nil {
dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
for _, field := range fields { for _, field := range fields {
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue continue
} }
fieldID := fieldIDPlus1 - 1
if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
dvIter != nil {
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
}
}
}
// find the chunkNumber where the docValues are stored // find the chunkNumber where the docValues are stored
docInChunk := localDocNum / uint64(s.chunkFactor) docInChunk := localDocNum / uint64(s.chunkFactor)
var dvr *docValueReader
if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists && for _, field := range fields {
dvIter != nil { if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue
}
fieldID := fieldIDPlus1 - 1
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
// check if the chunk is already loaded // check if the chunk is already loaded
if docInChunk != dvIter.curChunkNumber() { if docInChunk != dvr.curChunkNumber() {
err := dvIter.loadDvChunk(docInChunk, localDocNum, s) err := dvr.loadDvChunk(docInChunk, &s.SegmentBase)
if err != nil { if err != nil {
continue return dvs, err
} }
} }
_ = dvIter.visitDocValues(localDocNum, visitor) _ = dvr.visitDocValues(localDocNum, visitor)
} }
} }
return nil return dvs, nil
} }
// VisitableDocValueFields returns the list of fields with // VisitableDocValueFields returns the list of fields with
// persisted doc value terms ready to be visitable using the // persisted doc value terms ready to be visitable using the
// VisitDocumentFieldTerms method. // VisitDocumentFieldTerms method.
func (s *Segment) VisitableDocValueFields() ([]string, error) { func (s *Segment) VisitableDocValueFields() ([]string, error) {
var rv []string return s.fieldDvNames, nil
for fieldID, field := range s.fieldsInv {
if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok &&
dvIter != nil {
rv = append(rv, field)
}
}
return rv, nil
} }

@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) {
for i, itr := range rv.itrs { for i, itr := range rv.itrs {
rv.currKs[i], rv.currVs[i] = itr.Current() rv.currKs[i], rv.currVs[i] = itr.Current()
} }
rv.updateMatches() rv.updateMatches(false)
if rv.lowK == nil { if rv.lowK == nil && len(rv.lowIdxs) == 0 {
return rv, vellum.ErrIteratorDone return rv, vellum.ErrIteratorDone
} }
return rv, nil return rv, nil
} }
// updateMatches maintains the low key matches based on the currKs // updateMatches maintains the low key matches based on the currKs
func (m *enumerator) updateMatches() { func (m *enumerator) updateMatches(skipEmptyKey bool) {
m.lowK = nil m.lowK = nil
m.lowIdxs = m.lowIdxs[:0] m.lowIdxs = m.lowIdxs[:0]
m.lowCurr = 0 m.lowCurr = 0
for i, key := range m.currKs { for i, key := range m.currKs {
if key == nil { if (key == nil && m.currVs[i] == 0) || // in case of empty iterator
(len(key) == 0 && skipEmptyKey) { // skip empty keys
continue continue
} }
cmp := bytes.Compare(key, m.lowK) cmp := bytes.Compare(key, m.lowK)
if cmp < 0 || m.lowK == nil { if cmp < 0 || len(m.lowIdxs) == 0 {
// reached a new low // reached a new low
m.lowK = key m.lowK = key
m.lowIdxs = m.lowIdxs[:0] m.lowIdxs = m.lowIdxs[:0]
@ -102,9 +103,10 @@ func (m *enumerator) Next() error {
} }
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
} }
m.updateMatches() // can skip any empty keys encountered at this point
m.updateMatches(true)
} }
if m.lowK == nil { if m.lowK == nil && len(m.lowIdxs) == 0 {
return vellum.ErrIteratorDone return vellum.ErrIteratorDone
} }
return nil return nil

@ -18,16 +18,12 @@ import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"io" "io"
"github.com/Smerity/govarint"
) )
type chunkedIntCoder struct { type chunkedIntCoder struct {
final []byte final []byte
maxDocNum uint64
chunkSize uint64 chunkSize uint64
chunkBuf bytes.Buffer chunkBuf bytes.Buffer
encoder *govarint.Base128Encoder
chunkLens []uint64 chunkLens []uint64
currChunk uint64 currChunk uint64
@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
total := maxDocNum/chunkSize + 1 total := maxDocNum/chunkSize + 1
rv := &chunkedIntCoder{ rv := &chunkedIntCoder{
chunkSize: chunkSize, chunkSize: chunkSize,
maxDocNum: maxDocNum,
chunkLens: make([]uint64, total), chunkLens: make([]uint64, total),
final: make([]byte, 0, 64), final: make([]byte, 0, 64),
} }
rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf)
return rv return rv
} }
@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
chunk := docNum / c.chunkSize chunk := docNum / c.chunkSize
if chunk != c.currChunk { if chunk != c.currChunk {
// starting a new chunk // starting a new chunk
if c.encoder != nil {
// close out last
c.Close() c.Close()
c.chunkBuf.Reset() c.chunkBuf.Reset()
}
c.currChunk = chunk c.currChunk = chunk
} }
if len(c.buf) < binary.MaxVarintLen64 {
c.buf = make([]byte, binary.MaxVarintLen64)
}
for _, val := range vals { for _, val := range vals {
_, err := c.encoder.PutU64(val) wb := binary.PutUvarint(c.buf, val)
_, err := c.chunkBuf.Write(c.buf[:wb])
if err != nil { if err != nil {
return err return err
} }
@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
return nil return nil
} }
// AddBytes appends the raw bytes in buf to the chunk that docNum falls into.
// If docNum belongs to a different chunk than the one currently being built,
// the current chunk is closed out and its buffer reset before buf is written.
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
	if target := docNum / c.chunkSize; target != c.currChunk {
		// starting a new chunk
		c.Close()
		c.chunkBuf.Reset()
		c.currChunk = target
	}

	if _, err := c.chunkBuf.Write(buf); err != nil {
		return err
	}
	return nil
}
// Close indicates you are done calling Add() this allows the final chunk // Close indicates you are done calling Add() this allows the final chunk
// to be encoded. // to be encoded.
func (c *chunkedIntCoder) Close() { func (c *chunkedIntCoder) Close() {
c.encoder.Close()
encodingBytes := c.chunkBuf.Bytes() encodingBytes := c.chunkBuf.Bytes()
c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
c.final = append(c.final, encodingBytes...) c.final = append(c.final, encodingBytes...)
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close
} }
// Write commits all the encoded chunked integers to the provided writer. // Write commits all the encoded chunked integers to the provided writer.
@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
} }
buf := c.buf buf := c.buf
// write out the number of chunks & each chunkLen // convert the chunk lengths into chunk offsets
n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
for _, chunkLen := range c.chunkLens {
n += binary.PutUvarint(buf[n:], uint64(chunkLen)) // write out the number of chunks & each chunk offsets
n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
for _, chunkOffset := range chunkOffsets {
n += binary.PutUvarint(buf[n:], chunkOffset)
} }
tw, err := w.Write(buf[:n]) tw, err := w.Write(buf[:n])
@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
} }
return tw, nil return tw, nil
} }
// FinalSize reports the number of bytes currently held in the coder's
// final buffer.
func (c *chunkedIntCoder) FinalSize() int {
	total := len(c.final)
	return total
}
// modifyLengthsToEndOffsets rewrites, in place, a slice of chunk lengths
// into a slice of cumulative end offsets and returns that same slice.
// readChunkBoundary derives the start and end of each chunk from the
// result: the end offset of chunk i is stored at index i, and its start
// offset is the value stored at index i-1 (always zero for chunk 0).
// eg:
// Lens -> 5 5 5 5 => 5 10 15 20
// Lens -> 0 5 0 5 => 0 5 5 10
// Lens -> 0 0 0 5 => 0 0 0 5
// Lens -> 5 0 0 0 => 5 5 5 5
// Lens -> 0 5 0 0 => 0 5 5 5
// Lens -> 0 0 5 0 => 0 0 5 5
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
	var total uint64
	for pos, length := range lengths {
		// the length is read before its slot is overwritten with the sum
		total += length
		lengths[pos] = total
	}
	return lengths
}
// readChunkBoundary returns the start and end offsets of the given chunk
// from a slice of cumulative end offsets. The end is offsets[chunk]; the
// start is the previous chunk's end, or zero for the first chunk.
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
	if chunk <= 0 {
		return 0, offsets[chunk]
	}
	prevEnd := offsets[chunk-1]
	return prevEnd, offsets[chunk]
}

@ -24,11 +24,13 @@ import (
"sort" "sort"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint" seg "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum" "github.com/couchbase/vellum"
"github.com/golang/snappy" "github.com/golang/snappy"
) )
var DefaultFileMergerBufferSize = 1024 * 1024
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
// Merge takes a slice of zap segments and bit masks describing which // Merge takes a slice of zap segments and bit masks describing which
@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
// remaining data. This new segment is built at the specified path, // remaining data. This new segment is built at the specified path,
// with the provided chunkFactor. // with the provided chunkFactor.
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
chunkFactor uint32) ([][]uint64, error) { chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
[][]uint64, uint64, error) {
segmentBases := make([]*SegmentBase, len(segments))
for segmenti, segment := range segments {
segmentBases[segmenti] = &segment.SegmentBase
}
return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s)
}
func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string,
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
[][]uint64, uint64, error) {
flag := os.O_RDWR | os.O_CREATE flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600) f, err := os.OpenFile(path, flag, 0600)
if err != nil { if err != nil {
return nil, err return nil, 0, err
} }
cleanup := func() { cleanup := func() {
@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
_ = os.Remove(path) _ = os.Remove(path)
} }
segmentBases := make([]*SegmentBase, len(segments))
for segmenti, segment := range segments {
segmentBases[segmenti] = &segment.SegmentBase
}
// buffer the output // buffer the output
br := bufio.NewWriter(f) br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize)
// wrap it for counting (tracking offsets) // wrap it for counting (tracking offsets)
cr := NewCountHashWriter(br) cr := NewCountHashWriterWithStatsReporter(br, s)
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
MergeToWriter(segmentBases, drops, chunkFactor, cr) MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh)
if err != nil { if err != nil {
cleanup() cleanup()
return nil, err return nil, 0, err
} }
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
docValueOffset, chunkFactor, cr.Sum32(), cr) docValueOffset, chunkFactor, cr.Sum32(), cr)
if err != nil { if err != nil {
cleanup() cleanup()
return nil, err return nil, 0, err
} }
err = br.Flush() err = br.Flush()
if err != nil { if err != nil {
cleanup() cleanup()
return nil, err return nil, 0, err
} }
err = f.Sync() err = f.Sync()
if err != nil { if err != nil {
cleanup() cleanup()
return nil, err return nil, 0, err
} }
err = f.Close() err = f.Close()
if err != nil { if err != nil {
cleanup() cleanup()
return nil, err return nil, 0, err
} }
return newDocNums, nil return newDocNums, uint64(cr.Count()), nil
} }
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
chunkFactor uint32, cr *CountHashWriter) ( chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) (
newDocNums [][]uint64, newDocNums [][]uint64,
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
fieldsMap = mapFields(fieldsInv) fieldsMap = mapFields(fieldsInv)
numDocs = computeNewDocCount(segments, drops) numDocs = computeNewDocCount(segments, drops)
if isClosed(closeCh) {
return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed
}
if numDocs > 0 { if numDocs > 0 {
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
fieldsMap, fieldsInv, fieldsSame, numDocs, cr) fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh)
if err != nil { if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err return nil, 0, 0, 0, 0, nil, nil, nil, err
} }
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
newDocNums, numDocs, chunkFactor, cr) fieldsInv, fieldsMap, fieldsSame,
newDocNums, numDocs, chunkFactor, cr, closeCh)
if err != nil { if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err return nil, 0, 0, 0, 0, nil, nil, nil, err
} }
@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64
} }
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
newSegDocCount uint64, chunkFactor uint32, newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
w *CountHashWriter) ([]uint64, uint64, error) { w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) {
var bufReuse bytes.Buffer
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var bufLoc []uint64 var bufLoc []uint64
@ -168,36 +182,38 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
var postItr *PostingsIterator var postItr *PostingsIterator
rv := make([]uint64, len(fieldsInv)) rv := make([]uint64, len(fieldsInv))
fieldDvLocs := make([]uint64, len(fieldsInv)) fieldDvLocsStart := make([]uint64, len(fieldsInv))
fieldDvLocsEnd := make([]uint64, len(fieldsInv))
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
// docTermMap is keyed by docNum, where the array impl provides
// better memory usage behavior than a sparse-friendlier hashmap
// for when docs have much structural similarity (i.e., every doc
// has a given field)
var docTermMap [][]byte
var vellumBuf bytes.Buffer var vellumBuf bytes.Buffer
// for each field
for fieldID, fieldName := range fieldsInv {
if fieldID != 0 {
vellumBuf.Reset()
}
newVellum, err := vellum.New(&vellumBuf, nil) newVellum, err := vellum.New(&vellumBuf, nil)
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
newRoaring := roaring.NewBitmap()
// for each field
for fieldID, fieldName := range fieldsInv {
// collect FST iterators from all active segments for this field // collect FST iterators from all active segments for this field
var newDocNums [][]uint64 var newDocNums [][]uint64
var drops []*roaring.Bitmap var drops []*roaring.Bitmap
var dicts []*Dictionary var dicts []*Dictionary
var itrs []vellum.Iterator var itrs []vellum.Iterator
var segmentsInFocus []*SegmentBase
for segmentI, segment := range segments { for segmentI, segment := range segments {
// check for the closure in meantime
if isClosed(closeCh) {
return nil, 0, seg.ErrClosed
}
dict, err2 := segment.dictionary(fieldName) dict, err2 := segment.dictionary(fieldName)
if err2 != nil { if err2 != nil {
return nil, 0, err2 return nil, 0, err2
@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
} }
if itr != nil { if itr != nil {
newDocNums = append(newDocNums, newDocNumsIn[segmentI]) newDocNums = append(newDocNums, newDocNumsIn[segmentI])
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() {
drops = append(drops, dropsIn[segmentI]) drops = append(drops, dropsIn[segmentI])
} else {
drops = append(drops, nil)
}
dicts = append(dicts, dict) dicts = append(dicts, dict)
itrs = append(itrs, itr) itrs = append(itrs, itr)
segmentsInFocus = append(segmentsInFocus, segment)
} }
} }
} }
if uint64(cap(docTermMap)) < newSegDocCount {
docTermMap = make([][]byte, newSegDocCount)
} else {
docTermMap = docTermMap[0:newSegDocCount]
for docNum := range docTermMap { // reset the docTermMap
docTermMap[docNum] = docTermMap[docNum][:0]
}
}
var prevTerm []byte var prevTerm []byte
newRoaring := roaring.NewBitmap() newRoaring.Clear()
newRoaringLocs := roaring.NewBitmap()
finishTerm := func(term []byte) error {
if term == nil {
return nil
}
tfEncoder.Close() var lastDocNum, lastFreq, lastNorm uint64
locEncoder.Close()
if newRoaring.GetCardinality() > 0 { // determines whether to use "1-hit" encoding optimization
// this field/term actually has hits in the new segment, lets write it down // when a term appears in only 1 doc, with no loc info,
freqOffset := uint64(w.Count()) // has freq of 1, and the docNum fits into 31-bits
_, err := tfEncoder.Write(w) use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
if err != nil { if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
return err docNum := uint64(newRoaring.Minimum())
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
return true, docNum, lastNorm
} }
locOffset := uint64(w.Count())
_, err = locEncoder.Write(w)
if err != nil {
return err
} }
postingLocOffset := uint64(w.Count()) return false, 0, 0
_, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64)
if err != nil {
return err
} }
postingOffset := uint64(w.Count())
// write out the start of the term info finishTerm := func(term []byte) error {
n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) tfEncoder.Close()
_, err = w.Write(bufMaxVarintLen64[:n]) locEncoder.Close()
if err != nil {
return err postingsOffset, err := writePostings(newRoaring,
} tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
// write out the start of the loc info
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return err
}
// write out the start of the posting locs
n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return err
}
_, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64)
if err != nil { if err != nil {
return err return err
} }
err = newVellum.Insert(term, postingOffset) if postingsOffset > 0 {
err = newVellum.Insert(term, postingsOffset)
if err != nil { if err != nil {
return err return err
} }
} }
newRoaring = roaring.NewBitmap() newRoaring.Clear()
newRoaringLocs = roaring.NewBitmap()
tfEncoder.Reset() tfEncoder.Reset()
locEncoder.Reset() locEncoder.Reset()
lastDocNum = 0
lastFreq = 0
lastNorm = 0
return nil return nil
} }
@ -301,66 +291,39 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
term, itrI, postingsOffset := enumerator.Current() term, itrI, postingsOffset := enumerator.Current()
if !bytes.Equal(prevTerm, term) { if !bytes.Equal(prevTerm, term) {
// if the term changed, write out the info collected // check for the closure in meantime
// for the previous term if isClosed(closeCh) {
err2 := finishTerm(prevTerm) return nil, 0, seg.ErrClosed
if err2 != nil {
return nil, 0, err2
}
} }
var err2 error // if the term changed, write out the info collected
postings, err2 = dicts[itrI].postingsListFromOffset( // for the previous term
postingsOffset, drops[itrI], postings) err = finishTerm(prevTerm)
if err2 != nil {
return nil, 0, err2
}
newDocNumsI := newDocNums[itrI]
postItr = postings.iterator(postItr)
next, err2 := postItr.Next()
for next != nil && err2 == nil {
hitNewDocNum := newDocNumsI[next.Number()]
if hitNewDocNum == docDropped {
return nil, 0, fmt.Errorf("see hit with dropped doc num")
}
newRoaring.Add(uint32(hitNewDocNum))
// encode norm bits
norm := next.Norm()
normBits := math.Float32bits(float32(norm))
err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
locs := next.Locations()
if len(locs) > 0 {
newRoaringLocs.Add(uint32(hitNewDocNum))
for _, loc := range locs {
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
} }
args := bufLoc[0:5]
args[0] = uint64(fieldsMap[loc.Field()] - 1) postings, err = dicts[itrI].postingsListFromOffset(
args[1] = loc.Pos() postingsOffset, drops[itrI], postings)
args[2] = loc.Start()
args[3] = loc.End()
args[4] = uint64(len(loc.ArrayPositions()))
args = append(args, loc.ArrayPositions()...)
err = locEncoder.Add(hitNewDocNum, args...)
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
}
}
docTermMap[hitNewDocNum] = postItr = postings.iterator(true, true, true, postItr)
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
next, err2 = postItr.Next() if fieldsSame {
// can optimize by copying freq/norm/loc bytes directly
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
term, postItr, newDocNums[itrI], newRoaring,
tfEncoder, locEncoder)
} else {
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
fieldsMap, term, postItr, newDocNums[itrI], newRoaring,
tfEncoder, locEncoder, bufLoc)
} }
if err2 != nil { if err != nil {
return nil, 0, err2 return nil, 0, err
} }
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
@ -368,7 +331,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
err = enumerator.Next() err = enumerator.Next()
} }
if err != nil && err != vellum.ErrIteratorDone { if err != vellum.ErrIteratorDone {
return nil, 0, err return nil, 0, err
} }
@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
rv[fieldID] = dictOffset rv[fieldID] = dictOffset
// get the field doc value offset (start)
fieldDvLocsStart[fieldID] = uint64(w.Count())
// update the field doc values // update the field doc values
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true)
for docNum, docTerms := range docTermMap {
if len(docTerms) > 0 { fdvReadersAvailable := false
err = fdvEncoder.Add(uint64(docNum), docTerms) var dvIterClone *docValueReader
for segmentI, segment := range segmentsInFocus {
// check for the closure in meantime
if isClosed(closeCh) {
return nil, 0, seg.ErrClosed
}
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists &&
dvIter != nil {
fdvReadersAvailable = true
dvIterClone = dvIter.cloneInto(dvIterClone)
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error {
if newDocNums[segmentI][docNum] == docDropped {
return nil
}
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms)
if err != nil {
return err
}
return nil
})
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
} }
} }
if fdvReadersAvailable {
err = fdvEncoder.Close() err = fdvEncoder.Close()
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
// get the field doc value offset
fieldDvLocs[fieldID] = uint64(w.Count())
// persist the doc value details for this field // persist the doc value details for this field
_, err = fdvEncoder.Write(w) _, err = fdvEncoder.Write()
if err != nil {
return nil, 0, err
}
// get the field doc value offset (end)
fieldDvLocsEnd[fieldID] = uint64(w.Count())
} else {
fieldDvLocsStart[fieldID] = fieldNotUninverted
fieldDvLocsEnd[fieldID] = fieldNotUninverted
}
// reset vellum buffer and vellum builder
vellumBuf.Reset()
err = newVellum.Reset(&vellumBuf)
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldDvLocsOffset := uint64(w.Count()) fieldDvLocsOffset := uint64(w.Count())
buf := bufMaxVarintLen64 buf := bufMaxVarintLen64
for _, offset := range fieldDvLocs { for i := 0; i < len(fieldDvLocsStart); i++ {
n := binary.PutUvarint(buf, uint64(offset)) n := binary.PutUvarint(buf, fieldDvLocsStart[i])
_, err := w.Write(buf[:n]) _, err := w.Write(buf[:n])
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
n = binary.PutUvarint(buf, fieldDvLocsEnd[i])
_, err = w.Write(buf[:n])
if err != nil {
return nil, 0, err
}
} }
return rv, fieldDvLocsOffset, nil return rv, fieldDvLocsOffset, nil
} }
// mergeTermFreqNormLocs drains postItr and re-encodes each hit into the
// encoders of the merged segment.
//
// For every hit the old doc number is translated through newDocNums, the
// new doc number is added to newRoaring, and the freq (tagged with a
// has-locations flag) and norm bits are added to tfEncoder. When the hit
// has locations, locEncoder first receives the total uvarint byte size of
// all of the hit's locations, followed by one entry per location:
// fieldID, pos, start, end, number of array positions, array positions.
//
// bufLoc is scratch space for building a location entry; the (possibly
// regrown) buffer is handed back as bufLocOut on the normal path so the
// caller can reuse it. The term parameter is not used here.
//
// The returned lastDocNum/lastFreq/lastNorm describe the last hit that
// was encoded. Meeting a hit whose doc number maps to docDropped is
// reported as an error.
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
	newDocNums []uint64, newRoaring *roaring.Bitmap,
	tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) (
	lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
	for {
		hit, nextErr := postItr.Next()
		if nextErr != nil || hit == nil {
			// iterator exhausted or failed: report what was encoded so far
			return lastDocNum, lastFreq, lastNorm, bufLoc, nextErr
		}

		remapped := newDocNums[hit.Number()]
		if remapped == docDropped {
			return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
		}
		newRoaring.Add(uint32(remapped))

		freq := hit.Frequency()
		normBits := uint64(math.Float32bits(float32(hit.Norm())))
		hitLocs := hit.Locations()
		hasLocs := len(hitLocs) > 0

		if err = tfEncoder.Add(remapped, encodeFreqHasLocs(freq, hasLocs), normBits); err != nil {
			return 0, 0, 0, nil, err
		}

		if hasLocs {
			// first pass: total encoded size of this hit's locations
			locBytes := 0
			for _, loc := range hitLocs {
				positions := loc.ArrayPositions()
				locBytes += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
					loc.Pos(), loc.Start(), loc.End(), uint64(len(positions)), positions)
			}
			if err = locEncoder.Add(remapped, uint64(locBytes)); err != nil {
				return 0, 0, 0, nil, err
			}

			// second pass: the location entries themselves
			for _, loc := range hitLocs {
				positions := loc.ArrayPositions()
				if need := 5 + len(positions); cap(bufLoc) < need {
					bufLoc = make([]uint64, 0, need)
				}
				entry := append(bufLoc[:0],
					uint64(fieldsMap[loc.Field()]-1),
					loc.Pos(),
					loc.Start(),
					loc.End(),
					uint64(len(positions)))
				entry = append(entry, positions...)
				if err = locEncoder.Add(remapped, entry...); err != nil {
					return 0, 0, 0, nil, err
				}
			}
		}

		lastDocNum, lastFreq, lastNorm = remapped, freq, normBits
	}
}
// mergeTermFreqNormLocsByCopying is the variant of the term merge that
// copies each hit's already-encoded freq/norm bytes and location bytes,
// as returned by postItr.nextBytes(), straight into tfEncoder and
// locEncoder instead of decoding and re-encoding them.
//
// Each hit's doc number is translated through newDocNums and added to
// newRoaring. Iteration stops when nextBytes reports an error or yields
// no freq/norm bytes. The returned lastDocNum/lastFreq/lastNorm describe
// the last hit that was copied. A hit whose doc number maps to docDropped
// is reported as an error. The term parameter is not used here.
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
	newDocNums []uint64, newRoaring *roaring.Bitmap,
	tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) (
	lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) {
	for {
		oldDocNum, freq, norm, freqNormBytes, locBytes, nextErr := postItr.nextBytes()
		if nextErr != nil || len(freqNormBytes) == 0 {
			return lastDocNum, lastFreq, lastNorm, nextErr
		}

		remapped := newDocNums[oldDocNum]
		if remapped == docDropped {
			return 0, 0, 0, fmt.Errorf("see hit with dropped doc num")
		}
		newRoaring.Add(uint32(remapped))

		if err = tfEncoder.AddBytes(remapped, freqNormBytes); err != nil {
			return 0, 0, 0, err
		}
		if len(locBytes) > 0 {
			if err = locEncoder.AddBytes(remapped, locBytes); err != nil {
				return 0, 0, 0, err
			}
		}

		lastDocNum, lastFreq, lastNorm = remapped, freq, norm
	}
}
// writePostings persists one term's postings to w and returns the value
// to store for that term.
//
// An empty postings bitmap writes nothing and yields offset 0. When
// use1HitEncoding is non-nil and elects the 1-hit form for this
// cardinality, nothing is written either and the FSTValEncode1Hit value
// is returned. Otherwise the freq/norm data and the location data are
// written, followed by a postings record made of the uvarint offset of
// the freq/norm data, the uvarint offset of the location data and the
// length-prefixed roaring bitmap; the returned offset is where that
// record starts. bufMaxVarintLen64 is scratch space for uvarint encoding.
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
	use1HitEncoding func(uint64) (bool, uint64, uint64),
	w *CountHashWriter, bufMaxVarintLen64 []byte) (
	offset uint64, err error) {
	cardinality := postings.GetCardinality()
	if cardinality == 0 {
		return 0, nil
	}

	if use1HitEncoding != nil {
		if as1Hit, docNum, normBits := use1HitEncoding(cardinality); as1Hit {
			return FSTValEncode1Hit(docNum, normBits), nil
		}
	}

	tfOffset := uint64(w.Count())
	if _, err = tfEncoder.Write(w); err != nil {
		return 0, err
	}

	locOffset := uint64(w.Count())
	if _, err = locEncoder.Write(w); err != nil {
		return 0, err
	}

	recordStart := uint64(w.Count())

	// the record header: where the freq/norm and location data live
	for _, off := range [...]uint64{tfOffset, locOffset} {
		n := binary.PutUvarint(bufMaxVarintLen64, off)
		if _, err = w.Write(bufMaxVarintLen64[:n]); err != nil {
			return 0, err
		}
	}

	if _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64); err != nil {
		return 0, err
	}

	return recordStart, nil
}
// varintEncoder is a callback that encodes a single uint64 value and
// returns an int and an error. The metaEncode closures in this package
// have this shape: they write the value's uvarint encoding to a metadata
// buffer and return the result of that Write.
type varintEncoder func(uint64) (int, error)
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
w *CountHashWriter) (uint64, [][]uint64, error) { w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) {
var rv [][]uint64 // The remapped or newDocNums for each segment. var rv [][]uint64 // The remapped or newDocNums for each segment.
var newDocNum uint64 var newDocNum uint64
var curr int var curr int
var metaBuf bytes.Buffer
var data, compressed []byte var data, compressed []byte
var metaBuf bytes.Buffer
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) varBuf := make([]byte, binary.MaxVarintLen64)
metaEncode := func(val uint64) (int, error) {
wb := binary.PutUvarint(varBuf, val)
return metaBuf.Write(varBuf[:wb])
}
vals := make([][][]byte, len(fieldsInv)) vals := make([][][]byte, len(fieldsInv))
typs := make([][]byte, len(fieldsInv)) typs := make([][]byte, len(fieldsInv))
poss := make([][][]uint64, len(fieldsInv)) poss := make([][][]uint64, len(fieldsInv))
var posBuf []uint64
docNumOffsets := make([]uint64, newSegDocCount) docNumOffsets := make([]uint64, newSegDocCount)
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
defer visitDocumentCtxPool.Put(vdc)
// for each segment // for each segment
for segI, segment := range segments { for segI, segment := range segments {
// check for the closure in meantime
if isClosed(closeCh) {
return 0, nil, seg.ErrClosed
}
segNewDocNums := make([]uint64, segment.numDocs) segNewDocNums := make([]uint64, segment.numDocs)
dropsI := drops[segI] dropsI := drops[segI]
@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
curr = 0 curr = 0
metaBuf.Reset() metaBuf.Reset()
data = data[:0] data = data[:0]
compressed = compressed[:0]
posTemp := posBuf
// collect all the data // collect all the data
for i := 0; i < len(fieldsInv); i++ { for i := 0; i < len(fieldsInv); i++ {
@ -503,42 +676,63 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
typs[i] = typs[i][:0] typs[i] = typs[i][:0]
poss[i] = poss[i][:0] poss[i] = poss[i][:0]
} }
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
fieldID := int(fieldsMap[field]) - 1 fieldID := int(fieldsMap[field]) - 1
vals[fieldID] = append(vals[fieldID], value) vals[fieldID] = append(vals[fieldID], value)
typs[fieldID] = append(typs[fieldID], typ) typs[fieldID] = append(typs[fieldID], typ)
poss[fieldID] = append(poss[fieldID], pos)
// copy array positions to preserve them beyond the scope of this callback
var curPos []uint64
if len(pos) > 0 {
if cap(posTemp) < len(pos) {
posBuf = make([]uint64, len(pos)*len(fieldsInv))
posTemp = posBuf
}
curPos = posTemp[0:len(pos)]
copy(curPos, pos)
posTemp = posTemp[len(pos):]
}
poss[fieldID] = append(poss[fieldID], curPos)
return true return true
}) })
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// now walk the fields in order // _id field special case optimizes ExternalID() lookups
for fieldID := range fieldsInv { idFieldVal := vals[uint16(0)][0]
storedFieldValues := vals[int(fieldID)] _, err = metaEncode(uint64(len(idFieldVal)))
if err != nil {
return 0, nil, err
}
// now walk the non-"_id" fields in order
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
storedFieldValues := vals[fieldID]
stf := typs[int(fieldID)] stf := typs[fieldID]
spf := poss[int(fieldID)] spf := poss[fieldID]
var err2 error var err2 error
curr, data, err2 = persistStoredFieldValues(fieldID, curr, data, err2 = persistStoredFieldValues(fieldID,
storedFieldValues, stf, spf, curr, metaEncoder, data) storedFieldValues, stf, spf, curr, metaEncode, data)
if err2 != nil { if err2 != nil {
return 0, nil, err2 return 0, nil, err2
} }
} }
metaEncoder.Close()
metaBytes := metaBuf.Bytes() metaBytes := metaBuf.Bytes()
compressed = snappy.Encode(compressed, data) compressed = snappy.Encode(compressed[:cap(compressed)], data)
// record where we're about to start writing // record where we're about to start writing
docNumOffsets[newDocNum] = uint64(w.Count()) docNumOffsets[newDocNum] = uint64(w.Count())
// write out the meta len and compressed data len // write out the meta len and compressed data len
_, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) _, err = writeUvarints(w,
uint64(len(metaBytes)),
uint64(len(idFieldVal)+len(compressed)))
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
if err != nil { if err != nil {
return 0, nil, err return 0, nil, err
} }
// now write the _id field val (counted as part of the 'compressed' data)
_, err = w.Write(idFieldVal)
if err != nil {
return 0, nil, err
}
// now write the compressed data // now write the compressed data
_, err = w.Write(compressed) _, err = w.Write(compressed)
if err != nil { if err != nil {
@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) {
return fieldsSame, rv return fieldsSame, rv
} }
// isClosed reports, without blocking, whether a receive on closeCh can
// proceed right now, which is the case once the channel has been closed.
// A nil or open-and-empty channel yields false.
func isClosed(closeCh chan struct{}) bool {
	signalled := false
	select {
	case <-closeCh:
		signalled = true
	default:
		// nothing to receive: still open
	}
	return signalled
}

@ -0,0 +1,826 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"math"
"sort"
"sync"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/couchbase/vellum"
"github.com/golang/snappy"
)
// Tuning knobs read by AnalysisResultsToSegmentBase when it pre-sizes
// its output buffer from the statistics of a previous conversion.
var (
	// NewSegmentBufferNumResultsBump is added to the number of analysis
	// results before the buffer size is estimated.
	NewSegmentBufferNumResultsBump int = 100

	// NewSegmentBufferNumResultsFactor is a multiplier applied while
	// estimating the buffer size.
	NewSegmentBufferNumResultsFactor float64 = 1.0

	// NewSegmentBufferAvgBytesPerDocFactor is a multiplier applied while
	// estimating the buffer size.
	NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
)
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
// SegmentBase from analysis results.
//
// It borrows an interim working set from interimPool, converts the
// results into an in-memory buffer and wraps that buffer with
// InitSegmentBase. The second return value is the number of bytes that
// were produced. The interim is only reset and handed back to the pool
// when both the conversion and the reset succeed; otherwise it is simply
// dropped.
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
	chunkFactor uint32) (*SegmentBase, uint64, error) {
	s := interimPool.Get().(*interim)

	var br bytes.Buffer
	if s.lastNumDocs > 0 {
		// use previous results to initialize the buf with an estimate
		// size, but note that the interim instance comes from a
		// global interimPool, so multiple scorch instances indexing
		// different docs can lead to low quality estimates
		estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
			NewSegmentBufferAvgBytesPerDocFactor)
		estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
			NewSegmentBufferNumResultsFactor)
		// bytes.Buffer.Grow panics on a negative count, which negative
		// tuning values or an overflowing product could produce; the
		// estimate is only a hint, so skip it in that case
		if estimate := estimateAvgBytesPerDoc * estimateNumResults; estimate > 0 {
			br.Grow(estimate)
		}
	}

	s.results = results
	s.chunkFactor = chunkFactor
	s.w = NewCountHashWriter(&br)

	storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
		err := s.convert()
	if err != nil {
		return nil, uint64(0), err
	}

	out := br.Bytes()

	sb, err := InitSegmentBase(out, s.w.Sum32(), chunkFactor,
		s.FieldsMap, s.FieldsInv, uint64(len(results)),
		storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)

	if err == nil && s.reset() == nil {
		// reset() zeroes the statistics, so record them afterwards for
		// the next user of this pooled instance
		s.lastNumDocs = len(results)
		s.lastOutSize = len(out)
		interimPool.Put(s)
	}

	return sb, uint64(len(out)), err
}
// interimPool recycles *interim working sets between calls to
// AnalysisResultsToSegmentBase; when the pool is empty, New supplies a
// zero-valued interim.
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment. Instances are pooled in
// interimPool and cleared with reset() between uses.
type interim struct {
// results are the analysis results being converted; the index of a
// result in this slice is used as its docNum
results []*index.AnalysisResult
// chunkFactor is passed to the chunked encoders built in writeDicts
chunkFactor uint32
// w is the destination that all segment bytes are written to
w *CountHashWriter
// FieldsMap adds 1 to field id to avoid zero value issues
// name -> field id + 1
FieldsMap map[string]uint16
// FieldsInv is the inverse of FieldsMap
// field id -> name
FieldsInv []string
// Term dictionaries for each field
// field id -> term -> postings list id + 1
Dicts []map[string]uint64
// Terms for each field, where terms are sorted ascending
// field id -> []term
DictKeys [][]string
// Fields whose IncludeDocValues is true
// field id -> bool
IncludeDocValues []bool
// postings id -> bitmap of docNums
Postings []*roaring.Bitmap
// postings id -> freq/norm's, one for each docNum in postings
FreqNorms [][]interimFreqNorm
// freqNormsBacking is the single allocation that the FreqNorms
// entries are carved out of in prepareDicts
freqNormsBacking []interimFreqNorm
// postings id -> locs, one for each freq
Locs [][]interimLoc
// locsBacking is the single allocation that the Locs entries are
// carved out of in prepareDicts
locsBacking []interimLoc
numTermsPerPostingsList []int // key is postings list id
numLocsPerPostingsList []int // key is postings list id
// builder is the vellum builder reused for each field's dictionary;
// it writes into builderBuf
builder *vellum.Builder
builderBuf bytes.Buffer
// metaBuf collects the per-document stored-field metadata in
// writeStoredFields
metaBuf bytes.Buffer
// tmp0 and tmp1 are scratch byte slices kept between uses: grabBuf
// hands out tmp0, and writeStoredFields uses them for the
// uncompressed and compressed stored data
tmp0 []byte
tmp1 []byte
// lastNumDocs and lastOutSize record the document count and output
// size of the previous conversion; AnalysisResultsToSegmentBase uses
// them to estimate the next output buffer size
lastNumDocs int
lastOutSize int
}
// reset clears the interim so it can be returned to the pool, keeping
// the capacity of its slices and buffers where possible. References to
// the previous inputs and outputs are dropped, the pooled bitmaps are
// emptied, and the vellum builder (if one exists) is re-pointed at the
// emptied builderBuf. The only error that can be returned is the one
// from resetting the vellum builder.
func (s *interim) reset() (err error) {
	// drop references to the previous conversion's inputs and outputs
	s.results, s.w = nil, nil
	s.chunkFactor = 0
	s.FieldsMap, s.FieldsInv = nil, nil

	// per-field state: release the maps, keep the key slices' capacity
	for fieldID := range s.Dicts {
		s.Dicts[fieldID] = nil
	}
	s.Dicts = s.Dicts[:0]
	for fieldID := range s.DictKeys {
		s.DictKeys[fieldID] = s.DictKeys[fieldID][:0]
	}
	s.DictKeys = s.DictKeys[:0]
	for fieldID := range s.IncludeDocValues {
		s.IncludeDocValues[fieldID] = false
	}
	s.IncludeDocValues = s.IncludeDocValues[:0]

	// per-postings-list state: bitmaps are emptied but kept for reuse
	for _, bitmap := range s.Postings {
		bitmap.Clear()
	}
	s.Postings = s.Postings[:0]

	var emptyFreqNorm interimFreqNorm
	s.FreqNorms = s.FreqNorms[:0]
	for i := range s.freqNormsBacking {
		s.freqNormsBacking[i] = emptyFreqNorm
	}
	s.freqNormsBacking = s.freqNormsBacking[:0]

	var emptyLoc interimLoc
	s.Locs = s.Locs[:0]
	for i := range s.locsBacking {
		s.locsBacking[i] = emptyLoc
	}
	s.locsBacking = s.locsBacking[:0]

	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]

	// the builder must be reset after its buffer has been emptied
	s.builderBuf.Reset()
	if s.builder != nil {
		err = s.builder.Reset(&s.builderBuf)
	}

	s.metaBuf.Reset()
	s.tmp0, s.tmp1 = s.tmp0[:0], s.tmp1[:0]
	s.lastNumDocs, s.lastOutSize = 0, 0

	return err
}
// grabBuf returns a byte slice of exactly size bytes backed by the tmp0
// scratch buffer, replacing tmp0 with a fresh allocation when its
// capacity is too small. The contents of the returned slice are whatever
// the scratch buffer held before.
func (s *interim) grabBuf(size int) []byte {
	if cap(s.tmp0) < size {
		s.tmp0 = make([]byte, size)
	}
	return s.tmp0[:size]
}
// interimStoredField gathers the stored values of one field within a
// single document; the three slices are appended to together in
// writeStoredFields, one entry per occurrence of the field.
type interimStoredField struct {
vals [][]byte
typs []byte
arrayposs [][]uint64 // array positions
}
// interimFreqNorm is the per-document entry of a postings list: the
// term's frequency, the field norm, and how many location entries the
// document contributed to the postings list's locs.
type interimFreqNorm struct {
freq uint64
norm float32
numLocs int
}
// interimLoc is a single term location as recorded by processDocument:
// the field it occurred in, its position, its start and end, and any
// array positions.
type interimLoc struct {
fieldID uint16
pos uint64
start uint64
end uint64
arrayposs []uint64
}
// convert runs the whole conversion of s.results into zap-encoded bytes
// written to s.w. It defines the fields, builds the dictionaries,
// processes the documents, then writes the stored fields, the
// dictionaries (only when there is at least one result) and the fields
// section.
//
// It returns, in order: the stored index offset, the fields index
// offset, the field doc values index offset and the per-field dictionary
// offsets.
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
	s.FieldsMap = map[string]uint16{}

	// registered first so that _id is fieldID 0
	s.getOrDefineField("_id")

	for _, result := range s.results {
		doc := result.Document
		for _, composite := range doc.CompositeFields {
			s.getOrDefineField(composite.Name())
		}
		for _, regular := range doc.Fields {
			s.getOrDefineField(regular.Name())
		}
	}

	// order the remaining fields by name (keep _id as first field) and
	// renumber the map to match the sorted order
	sort.Strings(s.FieldsInv[1:])
	for id, name := range s.FieldsInv {
		s.FieldsMap[name] = uint16(id + 1)
	}

	if numFields := len(s.FieldsInv); cap(s.IncludeDocValues) < numFields {
		s.IncludeDocValues = make([]bool, numFields)
	} else {
		s.IncludeDocValues = s.IncludeDocValues[:numFields]
	}

	s.prepareDicts()
	for _, keys := range s.DictKeys {
		sort.Strings(keys)
	}

	s.processDocuments()

	storedIndexOffset, err := s.writeStoredFields()
	if err != nil {
		return 0, 0, 0, nil, err
	}

	var (
		fdvIndexOffset uint64
		dictOffsets    []uint64
	)
	if len(s.results) == 0 {
		// nothing to write: every field gets a zero dictionary offset
		dictOffsets = make([]uint64, len(s.FieldsInv))
	} else if fdvIndexOffset, dictOffsets, err = s.writeDicts(); err != nil {
		return 0, 0, 0, nil, err
	}

	fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
	if err != nil {
		return 0, 0, 0, nil, err
	}

	return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}
// getOrDefineField returns the field id for fieldName. An unknown name
// is assigned the next id, which also appends the name to FieldsInv, a
// fresh term dictionary to Dicts, and an empty key list to DictKeys
// (reusing a slot left in DictKeys' spare capacity when there is one).
func (s *interim) getOrDefineField(fieldName string) int {
	if idPlus1, known := s.FieldsMap[fieldName]; known {
		return int(idPlus1 - 1)
	}

	idPlus1 := uint16(len(s.FieldsInv) + 1)
	s.FieldsMap[fieldName] = idPlus1
	s.FieldsInv = append(s.FieldsInv, fieldName)
	s.Dicts = append(s.Dicts, make(map[string]uint64))

	if used := len(s.DictKeys); used < cap(s.DictKeys) {
		// re-expose the slot kept from an earlier use and empty it
		s.DictKeys = s.DictKeys[:used+1]
		s.DictKeys[used] = s.DictKeys[used][:0]
	} else {
		s.DictKeys = append(s.DictKeys, []string(nil))
	}

	return int(idPlus1 - 1)
}
// prepareDicts fills Dicts and DictKeys from the analysis results and
// sizes the per-postings-list storage (Postings, FreqNorms, Locs) that
// processDocument later appends into.
func (s *interim) prepareDicts() {
// pidNext counts the postings lists assigned so far; dict values hold
// the postings list id + 1
var pidNext int
// running totals used below to size the shared backing arrays
var totTFs int
var totLocs int
// visitField registers the terms of one field occurrence: an unseen
// term gets the next postings list id and is appended to the field's
// key list, and the per-postings-list term and location counters are
// increased for every term
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
dict := s.Dicts[fieldID]
dictKeys := s.DictKeys[fieldID]
for term, tf := range tfs {
pidPlus1, exists := dict[term]
if !exists {
pidNext++
pidPlus1 = uint64(pidNext)
dict[term] = pidPlus1
dictKeys = append(dictKeys, term)
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
}
pid := pidPlus1 - 1
s.numTermsPerPostingsList[pid] += 1
s.numLocsPerPostingsList[pid] += len(tf.Locations)
totLocs += len(tf.Locations)
}
totTFs += len(tfs)
// store the possibly regrown key slice back
s.DictKeys[fieldID] = dictKeys
}
for _, result := range s.results {
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name()))
_, tf := field.Analyze()
visitField(fieldID, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
tf := result.Analyzed[i]
visitField(fieldID, tf)
}
}
numPostingsLists := pidNext
// reuse the Postings slice when its capacity suffices; otherwise
// allocate a new one, carry over whatever the old slice holds within
// its full capacity, and create bitmaps for the remaining nil slots
if cap(s.Postings) >= numPostingsLists {
s.Postings = s.Postings[:numPostingsLists]
} else {
postings := make([]*roaring.Bitmap, numPostingsLists)
copy(postings, s.Postings[:cap(s.Postings)])
for i := 0; i < numPostingsLists; i++ {
if postings[i] == nil {
postings[i] = roaring.New()
}
}
s.Postings = postings
}
if cap(s.FreqNorms) >= numPostingsLists {
s.FreqNorms = s.FreqNorms[:numPostingsLists]
} else {
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
}
if cap(s.freqNormsBacking) >= totTFs {
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
} else {
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
}
// give each postings list an empty slice positioned at its own region
// of the shared backing array; the region's size is the count gathered
// above, so appends up to that count stay within the backing array
freqNormsBacking := s.freqNormsBacking
for pid, numTerms := range s.numTermsPerPostingsList {
s.FreqNorms[pid] = freqNormsBacking[0:0]
freqNormsBacking = freqNormsBacking[numTerms:]
}
if cap(s.Locs) >= numPostingsLists {
s.Locs = s.Locs[:numPostingsLists]
} else {
s.Locs = make([][]interimLoc, numPostingsLists)
}
if cap(s.locsBacking) >= totLocs {
s.locsBacking = s.locsBacking[:totLocs]
} else {
s.locsBacking = make([]interimLoc, totLocs)
}
// same carving scheme as for FreqNorms, sized by location counts
locsBacking := s.locsBacking
for pid, numLocs := range s.numLocsPerPostingsList {
s.Locs[pid] = locsBacking[0:0]
locsBacking = locsBacking[numLocs:]
}
}
// processDocuments feeds every analysis result to processDocument,
// using the result's index in s.results as its docNum. The two per-field
// scratch slices are allocated once and cleared before each document.
func (s *interim) processDocuments() {
	numFields := len(s.FieldsInv)
	fieldLens := make([]int, numFields)
	fieldTFs := make([]analysis.TokenFrequencies, numFields)

	for docNum := range s.results {
		// clear these for reuse
		for i := range fieldLens {
			fieldLens[i] = 0
			fieldTFs[i] = nil
		}

		s.processDocument(uint64(docNum), s.results[docNum], fieldLens, fieldTFs)
	}
}
// processDocument records one document's terms in the interim postings
// data. The document's composite and regular fields are first rolled up
// per field id into fieldLens (summed lengths) and fieldTFs (merged
// token frequencies); then, for every term of every field, docNum is
// added to the term's postings bitmap and a freq/norm entry plus the
// term's locations are appended to FreqNorms and Locs.
//
// fieldLens and fieldTFs are caller-provided scratch slices indexed by
// field id; processDocuments clears them before each call.
func (s *interim) processDocument(docNum uint64,
result *index.AnalysisResult,
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
// visitField accumulates one field occurrence into the scratch slices:
// the first occurrence's frequencies are stored as-is, later ones are
// merged into them with MergeAll
visitField := func(fieldID uint16, fieldName string,
ln int, tf analysis.TokenFrequencies) {
fieldLens[fieldID] += ln
existingFreqs := fieldTFs[fieldID]
if existingFreqs != nil {
existingFreqs.MergeAll(fieldName, tf)
} else {
fieldTFs[fieldID] = tf
}
}
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name()))
ln, tf := field.Analyze()
visitField(fieldID, field.Name(), ln, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
ln := result.Length[i]
tf := result.Analyzed[i]
visitField(fieldID, field.Name(), ln, tf)
}
// now that it's been rolled up into fieldTFs, walk that
for fieldID, tfs := range fieldTFs {
dict := s.Dicts[fieldID]
// the norm is 1/sqrt(field length) and is shared by all of the
// field's terms in this document
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
for term, tf := range tfs {
// dict values are postings list id + 1
pid := dict[term] - 1
bs := s.Postings[pid]
bs.Add(uint32(docNum))
s.FreqNorms[pid] = append(s.FreqNorms[pid],
interimFreqNorm{
freq: uint64(tf.Frequency()),
norm: norm,
numLocs: len(tf.Locations),
})
if len(tf.Locations) > 0 {
locs := s.Locs[pid]
for _, loc := range tf.Locations {
// a location that names its own field is recorded under that
// field's id; otherwise it belongs to the field being walked
var locf = uint16(fieldID)
if loc.Field != "" {
locf = uint16(s.getOrDefineField(loc.Field))
}
// empty array positions are stored as nil
var arrayposs []uint64
if len(loc.ArrayPositions) > 0 {
arrayposs = loc.ArrayPositions
}
locs = append(locs, interimLoc{
fieldID: locf,
pos: uint64(loc.Position),
start: uint64(loc.Start),
end: uint64(loc.End),
arrayposs: arrayposs,
})
}
s.Locs[pid] = locs
}
}
}
}
// writeStoredFields writes the stored-field section for every document
// to s.w and returns the offset at which the stored index starts.
//
// Per document it writes, in this order: the uvarint length of the
// metadata, the uvarint combined length of the _id value and the
// compressed data, the metadata bytes, the raw _id value, and the
// snappy-compressed data of the remaining stored fields. After all
// documents, the stored index is written: one big-endian uint64 start
// offset per document, in docNum order.
//
// As a side effect it sets IncludeDocValues for every field whose
// options request doc values.
func (s *interim) writeStoredFields() (
storedIndexOffset uint64, err error) {
varBuf := make([]byte, binary.MaxVarintLen64)
// metaEncode appends val to the metadata buffer as a uvarint
metaEncode := func(val uint64) (int, error) {
wb := binary.PutUvarint(varBuf, val)
return s.metaBuf.Write(varBuf[:wb])
}
// work on the scratch buffers and hand the (possibly regrown) slices
// back to the interim on return so they can be reused
data, compressed := s.tmp0[:0], s.tmp1[:0]
defer func() { s.tmp0, s.tmp1 = data, compressed }()
// keyed by docNum
docStoredOffsets := make([]uint64, len(s.results))
// keyed by fieldID, for the current doc in the loop
docStoredFields := map[uint16]interimStoredField{}
for docNum, result := range s.results {
for fieldID := range docStoredFields { // reset for next doc
delete(docStoredFields, fieldID)
}
for _, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
opts := field.Options()
if opts.IsStored() {
isf := docStoredFields[fieldID]
isf.vals = append(isf.vals, field.Value())
isf.typs = append(isf.typs, encodeFieldType(field))
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
docStoredFields[fieldID] = isf
}
if opts.IncludeDocValues() {
s.IncludeDocValues[fieldID] = true
}
}
var curr int
s.metaBuf.Reset()
data = data[:0]
// _id field special case optimizes ExternalID() lookups
// NOTE(review): vals[0] is indexed unconditionally, so this assumes
// every document has a stored value for field id 0 (_id) - confirm
// that callers guarantee it
idFieldVal := docStoredFields[uint16(0)].vals[0]
_, err = metaEncode(uint64(len(idFieldVal)))
if err != nil {
return 0, err
}
// handle non-"_id" fields
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
isf, exists := docStoredFields[uint16(fieldID)]
if exists {
curr, data, err = persistStoredFieldValues(
fieldID, isf.vals, isf.typs, isf.arrayposs,
curr, metaEncode, data)
if err != nil {
return 0, err
}
}
}
metaBytes := s.metaBuf.Bytes()
// offer the scratch slice at its full capacity as the destination
compressed = snappy.Encode(compressed[:cap(compressed)], data)
// remember where this document's record starts
docStoredOffsets[docNum] = uint64(s.w.Count())
// this := declares an err local to the loop body; the writes below
// assign to it, and it is returned explicitly on failure
_, err := writeUvarints(s.w,
uint64(len(metaBytes)),
uint64(len(idFieldVal)+len(compressed)))
if err != nil {
return 0, err
}
_, err = s.w.Write(metaBytes)
if err != nil {
return 0, err
}
_, err = s.w.Write(idFieldVal)
if err != nil {
return 0, err
}
_, err = s.w.Write(compressed)
if err != nil {
return 0, err
}
}
// the stored index: one fixed-width offset per document
storedIndexOffset = uint64(s.w.Count())
for _, docStoredOffset := range docStoredOffsets {
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
if err != nil {
return 0, err
}
}
return storedIndexOffset, nil
}
// writeDicts writes, for every field, the postings of each of its terms,
// the field's vellum term dictionary, and (for fields flagged in
// IncludeDocValues) the field's doc values. It finishes with the doc
// values index: a uvarint start and end offset per field.
//
// It returns the offset of that doc values index and the per-field
// offsets at which the dictionaries start.
//
// NOTE(review): the encoders are sized with len(s.results)-1, so this
// relies on being called with at least one result, as convert() does.
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
dictOffsets = make([]uint64, len(s.FieldsInv))
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
// scratch space for uvarint encoding
buf := s.grabBuf(binary.MaxVarintLen64)
// the encoders are created once and reset after each use
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
// docNum -> the field's terms in that document, each followed by
// termSeparator; rebuilt for every field
var docTermMap [][]byte
if s.builder == nil {
s.builder, err = vellum.New(&s.builderBuf, nil)
if err != nil {
return 0, nil, err
}
}
for fieldID, terms := range s.DictKeys {
if cap(docTermMap) < len(s.results) {
docTermMap = make([][]byte, len(s.results))
} else {
docTermMap = docTermMap[0:len(s.results)]
for docNum := range docTermMap { // reset the docTermMap
docTermMap[docNum] = docTermMap[docNum][:0]
}
}
dict := s.Dicts[fieldID]
for _, term := range terms { // terms are already sorted
pid := dict[term] - 1
postingsBS := s.Postings[pid]
// freqNorms and locs are consumed in step with the bitmap iterator;
// this relies on the iterator yielding docNums in the order in which
// processDocument appended the entries
freqNorms := s.FreqNorms[pid]
freqNormOffset := 0
locs := s.Locs[pid]
locOffset := 0
postingsItr := postingsBS.Iterator()
for postingsItr.HasNext() {
docNum := uint64(postingsItr.Next())
freqNorm := freqNorms[freqNormOffset]
err = tfEncoder.Add(docNum,
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
uint64(math.Float32bits(freqNorm.norm)))
if err != nil {
return 0, nil, err
}
if freqNorm.numLocs > 0 {
// the total encoded size of this document's locations goes first
numBytesLocs := 0
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
numBytesLocs += totalUvarintBytes(
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
uint64(len(loc.arrayposs)), loc.arrayposs)
}
err = locEncoder.Add(docNum, uint64(numBytesLocs))
if err != nil {
return 0, nil, err
}
// then each location: five fixed values followed by its array
// positions
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
uint64(len(loc.arrayposs)))
if err != nil {
return 0, nil, err
}
err = locEncoder.Add(docNum, loc.arrayposs...)
if err != nil {
return 0, nil, err
}
}
locOffset += freqNorm.numLocs
}
freqNormOffset++
docTermMap[docNum] = append(
append(docTermMap[docNum], term...),
termSeparator)
}
tfEncoder.Close()
locEncoder.Close()
// this := declares an err local to the term loop body; the Insert
// below assigns to it, and it is returned explicitly on failure
postingsOffset, err :=
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
if err != nil {
return 0, nil, err
}
// only terms that produced a non-zero postings offset are inserted
// into the dictionary
if postingsOffset > uint64(0) {
err = s.builder.Insert([]byte(term), postingsOffset)
if err != nil {
return 0, nil, err
}
}
tfEncoder.Reset()
locEncoder.Reset()
}
err = s.builder.Close()
if err != nil {
return 0, nil, err
}
// record where this dictionary starts
dictOffsets[fieldID] = uint64(s.w.Count())
vellumData := s.builderBuf.Bytes()
// write out the length of the vellum data
n := binary.PutUvarint(buf, uint64(len(vellumData)))
_, err = s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
// write this vellum to disk
_, err = s.w.Write(vellumData)
if err != nil {
return 0, nil, err
}
// reset vellum for reuse
s.builderBuf.Reset()
err = s.builder.Reset(&s.builderBuf)
if err != nil {
return 0, nil, err
}
// write the field doc values
if s.IncludeDocValues[fieldID] {
for docNum, docTerms := range docTermMap {
if len(docTerms) > 0 {
err = fdvEncoder.Add(uint64(docNum), docTerms)
if err != nil {
return 0, nil, err
}
}
}
err = fdvEncoder.Close()
if err != nil {
return 0, nil, err
}
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
_, err = fdvEncoder.Write()
if err != nil {
return 0, nil, err
}
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
fdvEncoder.Reset()
} else {
// fields without doc values are marked with the sentinel offset
fdvOffsetsStart[fieldID] = fieldNotUninverted
fdvOffsetsEnd[fieldID] = fieldNotUninverted
}
}
// the doc values index: start and end offset for every field
fdvIndexOffset = uint64(s.w.Count())
for i := 0; i < len(fdvOffsetsStart); i++ {
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
_, err := s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
_, err = s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
}
return fdvIndexOffset, dictOffsets, nil
}
// encodeFieldType maps a document field to the one-byte type code that
// is stored alongside its value: 't' text, 'n' numeric, 'd' datetime,
// 'b' boolean, 'g' geo point, 'c' composite, and 'x' for anything else.
func encodeFieldType(f document.Field) byte {
	switch f.(type) {
	case *document.TextField:
		return 't'
	case *document.NumericField:
		return 'n'
	case *document.DateTimeField:
		return 'd'
	case *document.BooleanField:
		return 'b'
	case *document.GeoPointField:
		return 'g'
	case *document.CompositeField:
		return 'c'
	default:
		return byte('x')
	}
}
// totalUvarintBytes returns the total number of bytes needed to encode
// a, b, c, d, e and every element of more with binary.PutUvarint().
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
	for _, fixed := range [...]uint64{a, b, c, d, e} {
		n += numUvarintBytes(fixed)
	}
	for _, extra := range more {
		n += numUvarintBytes(extra)
	}
	return n
}
// numUvarintBytes returns the number of bytes binary.PutUvarint() needs
// to encode x: one byte, plus one more for every additional 7 bits.
func numUvarintBytes(x uint64) (n int) {
	n = 1
	for ; x >= 0x80; x >>= 7 {
		n++
	}
	return n
}

@ -18,71 +18,245 @@ import (
"bytes" "bytes"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"io"
"math" "math"
"reflect"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
) )
// PostingsList is an in-memory represenation of a postings list var reflectStaticSizePostingsList int
var reflectStaticSizePostingsIterator int
var reflectStaticSizePosting int
var reflectStaticSizeLocation int
// init records the static (shallow) in-memory size of each postings
// related struct, as reported by reflect, for use by the Size() methods.
func init() {
	reflectStaticSizePostingsList = int(reflect.TypeOf(PostingsList{}).Size())
	reflectStaticSizePostingsIterator = int(reflect.TypeOf(PostingsIterator{}).Size())
	reflectStaticSizePosting = int(reflect.TypeOf(Posting{}).Size())
	reflectStaticSizeLocation = int(reflect.TypeOf(Location{}).Size())
}
// FST or vellum value (uint64) encoding is determined by the top two
// highest-order or most significant bits...
//
// encoding : MSB
// name : 63 62 61...to...bit #0 (LSB)
// ----------+---+---+---------------------------------------------------
// general : 0 | 0 | 62-bits of postingsOffset.
// ~ : 0 | 1 | reserved for future.
// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum.
// ~ : 1 | 1 | reserved for future.
//
// Encoding "general" is able to handle all cases, where the
// postingsOffset points to more information about the postings for
// the term.
//
// Encoding "1-hit" is used to optimize a commonly seen case when a
// term has only a single hit. For example, a term in the _id field
// will have only 1 hit. The "1-hit" encoding is used for a term
// in a field when...
//
// - term vector info is disabled for that field;
// - and, the term appears in only a single doc for that field;
// - and, the term's freq is exactly 1 in that single doc for that field;
// - and, the docNum must fit into 31-bits;
//
// Otherwise, the "general" encoding is used instead.
//
// In the "1-hit" encoding, the field in that single doc may have
// other terms, which is supported in the "1-hit" encoding by the
// positive float31 norm.
// Encoding selectors stored in the two most significant bits of an FST
// (vellum) value.
const (
	// FSTValEncodingMask isolates the two encoding-selector bits.
	FSTValEncodingMask = uint64(0xc000000000000000)
	// FSTValEncodingGeneral marks a value holding a postings offset.
	FSTValEncodingGeneral = uint64(0x0000000000000000)
	// FSTValEncoding1Hit marks a value holding an inlined norm and docNum.
	FSTValEncoding1Hit = uint64(0x8000000000000000)
)

// mask31Bits keeps only the low 31 bits of a value.
const mask31Bits = uint64(0x000000007fffffff)

// DocNum1HitFinished is the sentinel docNum written by an iterator once
// its single "1-hit" posting has been consumed.
const DocNum1HitFinished = math.MaxUint64

// NormBits1Hit is the float32 bit pattern of the norm value 1.
var NormBits1Hit = uint64(math.Float32bits(float32(1)))

// FSTValEncode1Hit packs docNum and normBits into a "1-hit" FST value:
// the selector bits, then the low 31 bits of normBits, then the low 31
// bits of docNum. Bits above the 31st of either input are discarded.
func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 {
	docField := docNum & mask31Bits
	normField := (normBits & mask31Bits) << 31
	return FSTValEncoding1Hit | normField | docField
}

// FSTValDecode1Hit is the inverse of FSTValEncode1Hit; it extracts the
// 31-bit docNum and 31-bit normBits from v without checking the selector
// bits.
func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) {
	docNum = v & mask31Bits
	normBits = (v >> 31) & mask31Bits
	return docNum, normBits
}

// under32Bits reports whether x fits in the 31 bits available to a
// "1-hit" field (that is, x <= mask31Bits).
func under32Bits(x uint64) bool {
	return x <= mask31Bits
}
// PostingsList is an in-memory representation of a postings list
type PostingsList struct { type PostingsList struct {
sb *SegmentBase sb *SegmentBase
postingsOffset uint64 postingsOffset uint64
freqOffset uint64 freqOffset uint64
locOffset uint64 locOffset uint64
locBitmap *roaring.Bitmap
postings *roaring.Bitmap postings *roaring.Bitmap
except *roaring.Bitmap except *roaring.Bitmap
// when normBits1Hit != 0, then this postings list came from a
// 1-hit encoding, and only the docNum1Hit & normBits1Hit apply
docNum1Hit uint64
normBits1Hit uint64
}
// represents an immutable, empty postings list
var emptyPostingsList = &PostingsList{}
// Size returns an estimate, in bytes, of the memory held by this postings
// list: the static struct size plus one pointer, plus the size of the
// except bitmap when one is set.
func (p *PostingsList) Size() int {
	total := reflectStaticSizePostingsList + size.SizeOfPtr
	if except := p.except; except != nil {
		total += int(except.GetSizeInBytes())
	}
	return total
}
func (p *PostingsList) OrInto(receiver *roaring.Bitmap) {
if p.normBits1Hit != 0 {
receiver.Add(uint32(p.docNum1Hit))
return
}
if p.postings != nil {
receiver.Or(p.postings)
}
} }
// Iterator returns an iterator for this postings list // Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator() segment.PostingsIterator { func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool,
return p.iterator(nil) prealloc segment.PostingsIterator) segment.PostingsIterator {
if p.normBits1Hit == 0 && p.postings == nil {
return emptyPostingsIterator
}
var preallocPI *PostingsIterator
pi, ok := prealloc.(*PostingsIterator)
if ok && pi != nil {
preallocPI = pi
}
if preallocPI == emptyPostingsIterator {
preallocPI = nil
}
return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI)
} }
func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
rv *PostingsIterator) *PostingsIterator {
if rv == nil { if rv == nil {
rv = &PostingsIterator{} rv = &PostingsIterator{}
} else { } else {
freqNormReader := rv.freqNormReader
if freqNormReader != nil {
freqNormReader.Reset([]byte(nil))
}
locReader := rv.locReader
if locReader != nil {
locReader.Reset([]byte(nil))
}
freqChunkOffsets := rv.freqChunkOffsets[:0]
locChunkOffsets := rv.locChunkOffsets[:0]
nextLocs := rv.nextLocs[:0]
nextSegmentLocs := rv.nextSegmentLocs[:0]
buf := rv.buf
*rv = PostingsIterator{} // clear the struct *rv = PostingsIterator{} // clear the struct
rv.freqNormReader = freqNormReader
rv.locReader = locReader
rv.freqChunkOffsets = freqChunkOffsets
rv.locChunkOffsets = locChunkOffsets
rv.nextLocs = nextLocs
rv.nextSegmentLocs = nextSegmentLocs
rv.buf = buf
} }
rv.postings = p rv.postings = p
rv.includeFreqNorm = includeFreq || includeNorm
rv.includeLocs = includeLocs
if p.normBits1Hit != 0 {
// "1-hit" encoding
rv.docNum1Hit = p.docNum1Hit
rv.normBits1Hit = p.normBits1Hit
if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) {
rv.docNum1Hit = DocNum1HitFinished
}
return rv
}
// "general" encoding, check if empty
if p.postings == nil {
return rv
}
if p.postings != nil {
// prepare the freq chunk details
var n uint64 var n uint64
var read int var read int
// prepare the freq chunk details
if rv.includeFreqNorm {
var numFreqChunks uint64 var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
rv.freqChunkLens = make([]uint64, int(numFreqChunks)) if cap(rv.freqChunkOffsets) >= int(numFreqChunks) {
rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)]
} else {
rv.freqChunkOffsets = make([]uint64, int(numFreqChunks))
}
for i := 0; i < int(numFreqChunks); i++ { for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
} }
rv.freqChunkStart = p.freqOffset + n rv.freqChunkStart = p.freqOffset + n
}
// prepare the loc chunk details // prepare the loc chunk details
if rv.includeLocs {
n = 0 n = 0
var numLocChunks uint64 var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
rv.locChunkLens = make([]uint64, int(numLocChunks)) if cap(rv.locChunkOffsets) >= int(numLocChunks) {
rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)]
} else {
rv.locChunkOffsets = make([]uint64, int(numLocChunks))
}
for i := 0; i < int(numLocChunks); i++ { for i := 0; i < int(numLocChunks); i++ {
rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
} }
rv.locChunkStart = p.locOffset + n rv.locChunkStart = p.locOffset + n
rv.locBitmap = p.locBitmap }
rv.all = p.postings.Iterator() rv.all = p.postings.Iterator()
if p.except != nil { if p.except != nil {
allExcept := roaring.AndNot(p.postings, p.except) rv.ActualBM = roaring.AndNot(p.postings, p.except)
rv.actual = allExcept.Iterator() rv.Actual = rv.ActualBM.Iterator()
} else { } else {
rv.actual = p.postings.Iterator() rv.ActualBM = p.postings
} rv.Actual = rv.all // Optimize to use same iterator for all & Actual.
} }
return rv return rv
@ -90,23 +264,30 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
// Count returns the number of items on this postings list // Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 { func (p *PostingsList) Count() uint64 {
if p.postings != nil { var n uint64
n := p.postings.GetCardinality() if p.normBits1Hit != 0 {
if p.except != nil { n = 1
e := p.except.GetCardinality() } else if p.postings != nil {
if e > n { n = p.postings.GetCardinality()
e = n
}
return n - e
} }
return n var e uint64
if p.except != nil {
e = p.except.GetCardinality()
} }
if n <= e {
return 0 return 0
} }
return n - e
}
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
rv.postingsOffset = postingsOffset rv.postingsOffset = postingsOffset
// handle "1-hit" encoding special case
if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit {
return rv.init1Hit(postingsOffset)
}
// read the location of the freq/norm details // read the location of the freq/norm details
var n uint64 var n uint64
var read int var read int
@ -117,29 +298,16 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
var locBitmapOffset uint64
locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var locBitmapLen uint64
locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64])
locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
rv.locBitmap = roaring.NewBitmap()
_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
if err != nil {
return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
}
var postingsLen uint64 var postingsLen uint64
postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
if rv.postings == nil {
rv.postings = roaring.NewBitmap() rv.postings = roaring.NewBitmap()
_, err = rv.postings.FromBuffer(roaringBytes) }
_, err := rv.postings.FromBuffer(roaringBytes)
if err != nil { if err != nil {
return fmt.Errorf("error loading roaring bitmap: %v", err) return fmt.Errorf("error loading roaring bitmap: %v", err)
} }
@ -147,65 +315,137 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
return nil return nil
} }
// init1Hit populates the 1-hit fields of rv by decoding fstVal with
// FSTValDecode1Hit. It always returns nil.
func (rv *PostingsList) init1Hit(fstVal uint64) error {
	rv.docNum1Hit, rv.normBits1Hit = FSTValDecode1Hit(fstVal)
	return nil
}
// PostingsIterator provides a way to iterate through the postings list // PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct { type PostingsIterator struct {
postings *PostingsList postings *PostingsList
all roaring.IntIterable all roaring.IntIterable
offset int Actual roaring.IntIterable
locoffset int ActualBM *roaring.Bitmap
actual roaring.IntIterable
currChunk uint32 currChunk uint32
currChunkFreqNorm []byte currChunkFreqNorm []byte
currChunkLoc []byte currChunkLoc []byte
freqNormDecoder *govarint.Base128Decoder
locDecoder *govarint.Base128Decoder
freqChunkLens []uint64 freqNormReader *bytes.Reader
locReader *bytes.Reader
freqChunkOffsets []uint64
freqChunkStart uint64 freqChunkStart uint64
locChunkLens []uint64 locChunkOffsets []uint64
locChunkStart uint64 locChunkStart uint64
locBitmap *roaring.Bitmap next Posting // reused across Next() calls
nextLocs []Location // reused across Next() calls
nextSegmentLocs []segment.Location // reused across Next() calls
docNum1Hit uint64
normBits1Hit uint64
next Posting buf []byte
includeFreqNorm bool
includeLocs bool
} }
func (i *PostingsIterator) loadChunk(chunk int) error { var emptyPostingsIterator = &PostingsIterator{}
if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens)) func (i *PostingsIterator) Size() int {
sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr +
len(i.currChunkFreqNorm) +
len(i.currChunkLoc) +
len(i.freqChunkOffsets)*size.SizeOfUint64 +
len(i.locChunkOffsets)*size.SizeOfUint64 +
i.next.Size()
for _, entry := range i.nextLocs {
sizeInBytes += entry.Size()
} }
// load correct chunk bytes
start := i.freqChunkStart return sizeInBytes
for j := 0; j < chunk; j++ { }
start += i.freqChunkLens[j]
func (i *PostingsIterator) loadChunk(chunk int) error {
if i.includeFreqNorm {
if chunk >= len(i.freqChunkOffsets) {
return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)",
chunk, len(i.freqChunkOffsets))
} }
end := start + i.freqChunkLens[chunk]
end, start := i.freqChunkStart, i.freqChunkStart
s, e := readChunkBoundary(chunk, i.freqChunkOffsets)
start += s
end += e
i.currChunkFreqNorm = i.postings.sb.mem[start:end] i.currChunkFreqNorm = i.postings.sb.mem[start:end]
i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) if i.freqNormReader == nil {
i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm)
} else {
i.freqNormReader.Reset(i.currChunkFreqNorm)
}
}
start = i.locChunkStart if i.includeLocs {
for j := 0; j < chunk; j++ { if chunk >= len(i.locChunkOffsets) {
start += i.locChunkLens[j] return fmt.Errorf("tried to load loc chunk that doesn't exist %d/(%d)",
chunk, len(i.locChunkOffsets))
} }
end = start + i.locChunkLens[chunk]
end, start := i.locChunkStart, i.locChunkStart
s, e := readChunkBoundary(chunk, i.locChunkOffsets)
start += s
end += e
i.currChunkLoc = i.postings.sb.mem[start:end] i.currChunkLoc = i.postings.sb.mem[start:end]
i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) if i.locReader == nil {
i.locReader = bytes.NewReader(i.currChunkLoc)
} else {
i.locReader.Reset(i.currChunkLoc)
}
}
i.currChunk = uint32(chunk) i.currChunk = uint32(chunk)
return nil return nil
} }
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
freq, err := i.freqNormDecoder.GetU64() if i.normBits1Hit != 0 {
return 1, i.normBits1Hit, false, nil
}
freqHasLocs, err := binary.ReadUvarint(i.freqNormReader)
if err != nil { if err != nil {
return 0, 0, fmt.Errorf("error reading frequency: %v", err) return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
} }
normBits, err := i.freqNormDecoder.GetU64() freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
normBits, err := binary.ReadUvarint(i.freqNormReader)
if err != nil { if err != nil {
return 0, 0, fmt.Errorf("error reading norm: %v", err) return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, hasLocs, err
}
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
rv := freq << 1
if hasLocs {
rv = rv | 0x01 // 0'th LSB encodes whether there are locations
} }
return freq, normBits, err return rv
}
func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
freq := freqHasLocs >> 1
hasLocs := freqHasLocs&0x01 != 0
return freq, hasLocs
} }
// readLocation processes all the integers on the stream representing a single // readLocation processes all the integers on the stream representing a single
@ -214,27 +454,27 @@ func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
// the contents. // the contents.
func (i *PostingsIterator) readLocation(l *Location) error { func (i *PostingsIterator) readLocation(l *Location) error {
// read off field // read off field
fieldID, err := i.locDecoder.GetU64() fieldID, err := binary.ReadUvarint(i.locReader)
if err != nil { if err != nil {
return fmt.Errorf("error reading location field: %v", err) return fmt.Errorf("error reading location field: %v", err)
} }
// read off pos // read off pos
pos, err := i.locDecoder.GetU64() pos, err := binary.ReadUvarint(i.locReader)
if err != nil { if err != nil {
return fmt.Errorf("error reading location pos: %v", err) return fmt.Errorf("error reading location pos: %v", err)
} }
// read off start // read off start
start, err := i.locDecoder.GetU64() start, err := binary.ReadUvarint(i.locReader)
if err != nil { if err != nil {
return fmt.Errorf("error reading location start: %v", err) return fmt.Errorf("error reading location start: %v", err)
} }
// read off end // read off end
end, err := i.locDecoder.GetU64() end, err := binary.ReadUvarint(i.locReader)
if err != nil { if err != nil {
return fmt.Errorf("error reading location end: %v", err) return fmt.Errorf("error reading location end: %v", err)
} }
// read off num array pos // read off num array pos
numArrayPos, err := i.locDecoder.GetU64() numArrayPos, err := binary.ReadUvarint(i.locReader)
if err != nil { if err != nil {
return fmt.Errorf("error reading location num array pos: %v", err) return fmt.Errorf("error reading location num array pos: %v", err)
} }
@ -245,14 +485,16 @@ func (i *PostingsIterator) readLocation(l *Location) error {
l.pos = pos l.pos = pos
l.start = start l.start = start
l.end = end l.end = end
if numArrayPos > 0 { if cap(l.ap) < int(numArrayPos) {
l.ap = make([]uint64, int(numArrayPos)) l.ap = make([]uint64, int(numArrayPos))
} else {
l.ap = l.ap[:int(numArrayPos)]
} }
} }
// read off array positions // read off array positions
for k := 0; k < int(numArrayPos); k++ { for k := 0; k < int(numArrayPos); k++ {
ap, err := i.locDecoder.GetU64() ap, err := binary.ReadUvarint(i.locReader)
if err != nil { if err != nil {
return fmt.Errorf("error reading array position: %v", err) return fmt.Errorf("error reading array position: %v", err)
} }
@ -266,105 +508,340 @@ func (i *PostingsIterator) readLocation(l *Location) error {
// Next returns the next posting on the postings list, or nil at the end // Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) { func (i *PostingsIterator) Next() (segment.Posting, error) {
if i.actual == nil || !i.actual.HasNext() { return i.nextAtOrAfter(0)
return nil, nil
} }
n := i.actual.Next()
nChunk := n / i.postings.sb.chunkFactor
allN := i.all.Next()
allNChunk := allN / i.postings.sb.chunkFactor
// n is the next actual hit (excluding some postings) // Advance returns the posting at the specified docNum or it is not present
// allN is the next hit in the full postings // the next posting, or if the end is reached, nil
// if they don't match, adjust offsets to factor in item we're skipping over func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) {
// incr the all iterator, and check again return i.nextAtOrAfter(docNum)
for allN != n { }
// in different chunks, reset offsets // Next returns the next posting on the postings list, or nil at the end
if allNChunk != nChunk { func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) {
i.locoffset = 0 docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter)
i.offset = 0 if err != nil || !exists {
} else { return nil, err
}
if i.currChunk != nChunk || i.currChunkFreqNorm == nil { i.next = Posting{} // clear the struct
err := i.loadChunk(int(nChunk)) rv := &i.next
rv.docNum = docNum
if !i.includeFreqNorm {
return rv, nil
}
var normBits uint64
var hasLocs bool
rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
if err != nil { if err != nil {
return nil, fmt.Errorf("error loading chunk: %v", err) return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.includeLocs && hasLocs {
// prepare locations into reused slices, where we assume
// rv.freq >= "number of locs", since in a composite field,
// some component fields might have their IncludeTermVector
// flags disabled while other component fields are enabled
if cap(i.nextLocs) >= int(rv.freq) {
i.nextLocs = i.nextLocs[0:rv.freq]
} else {
i.nextLocs = make([]Location, rv.freq, rv.freq*2)
} }
if cap(i.nextSegmentLocs) < int(rv.freq) {
i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2)
} }
rv.locs = i.nextSegmentLocs[:0]
// read off freq/offsets even though we don't care about them numLocsBytes, err := binary.ReadUvarint(i.locReader)
freq, _, err := i.readFreqNorm()
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
} }
if i.locBitmap.Contains(allN) {
for j := 0; j < int(freq); j++ { j := 0
err := i.readLocation(nil) startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader
for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) {
err := i.readLocation(&i.nextLocs[j])
if err != nil { if err != nil {
return nil, err return nil, err
} }
rv.locs = append(rv.locs, &i.nextLocs[j])
j++
}
}
return rv, nil
}
var freqHasLocs1Hit = encodeFreqHasLocs(1, false)
// nextBytes returns the docNum and the encoded freq & loc bytes for
// the next posting
func (i *PostingsIterator) nextBytes() (
docNumOut uint64, freq uint64, normBits uint64,
bytesFreqNorm []byte, bytesLoc []byte, err error) {
docNum, exists, err := i.nextDocNumAtOrAfter(0)
if err != nil || !exists {
return 0, 0, 0, nil, nil, err
}
if i.normBits1Hit != 0 {
if i.buf == nil {
i.buf = make([]byte, binary.MaxVarintLen64*2)
}
n := binary.PutUvarint(i.buf, freqHasLocs1Hit)
n += binary.PutUvarint(i.buf[n:], i.normBits1Hit)
return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil
} }
startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
var hasLocs bool
freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
if err != nil {
return 0, 0, 0, nil, nil, err
}
endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
if hasLocs {
startLoc := len(i.currChunkLoc) - i.locReader.Len()
numLocsBytes, err := binary.ReadUvarint(i.locReader)
if err != nil {
return 0, 0, 0, nil, nil,
fmt.Errorf("error reading location nextBytes numLocs: %v", err)
}
// skip over all the location bytes
_, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent)
if err != nil {
return 0, 0, 0, nil, nil, err
}
endLoc := len(i.currChunkLoc) - i.locReader.Len()
bytesLoc = i.currChunkLoc[startLoc:endLoc]
}
return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil
} }
// in same chunk, need to account for offsets // nextDocNum returns the next docNum on the postings list, and also
i.offset++ // sets up the currChunk / loc related fields of the iterator.
func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) {
if i.normBits1Hit != 0 {
if i.docNum1Hit == DocNum1HitFinished {
return 0, false, nil
}
if i.docNum1Hit < atOrAfter {
// advanced past our 1-hit
i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum
return 0, false, nil
}
docNum := i.docNum1Hit
i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum
return docNum, true, nil
}
if i.Actual == nil || !i.Actual.HasNext() {
return 0, false, nil
}
if i.postings == nil || i.postings.postings == i.ActualBM {
return i.nextDocNumAtOrAfterClean(atOrAfter)
}
n := i.Actual.Next()
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
}
if uint64(n) < atOrAfter {
// couldn't find anything
return 0, false, nil
}
allN := i.all.Next()
nChunk := n / i.postings.sb.chunkFactor
// when allN becomes >= to here, then allN is in the same chunk as nChunk.
allNReachesNChunk := nChunk * i.postings.sb.chunkFactor
// n is the next actual hit (excluding some postings), and
// allN is the next hit in the full postings, and
// if they don't match, move 'all' forwards until they do
for allN != n {
// we've reached same chunk, so move the freq/norm/loc decoders forward
if i.includeFreqNorm && allN >= allNReachesNChunk {
err := i.currChunkNext(nChunk)
if err != nil {
return 0, false, err
}
} }
allN = i.all.Next() allN = i.all.Next()
} }
if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) {
err := i.loadChunk(int(nChunk))
if err != nil {
return 0, false, fmt.Errorf("error loading chunk: %v", err)
}
}
return uint64(n), true, nil
}
// optimization when the postings list is "clean" (e.g., no updates &
// no deletions) where the all bitmap is the same as the actual bitmap
func (i *PostingsIterator) nextDocNumAtOrAfterClean(
atOrAfter uint64) (uint64, bool, error) {
n := i.Actual.Next()
if !i.includeFreqNorm {
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
}
if uint64(n) < atOrAfter {
return 0, false, nil // couldn't find anything
}
return uint64(n), true, nil
}
// freq-norm's needed, so maintain freq-norm chunk reader
sameChunkNexts := 0 // # of times we called Next() in the same chunk
nChunk := n / i.postings.sb.chunkFactor
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
nChunkPrev := nChunk
nChunk = n / i.postings.sb.chunkFactor
if nChunk != nChunkPrev {
sameChunkNexts = 0
} else {
sameChunkNexts += 1
}
}
if uint64(n) < atOrAfter {
// couldn't find anything
return 0, false, nil
}
for j := 0; j < sameChunkNexts; j++ {
err := i.currChunkNext(nChunk)
if err != nil {
return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err)
}
}
if i.currChunk != nChunk || i.currChunkFreqNorm == nil { if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
err := i.loadChunk(int(nChunk)) err := i.loadChunk(int(nChunk))
if err != nil { if err != nil {
return nil, fmt.Errorf("error loading chunk: %v", err) return 0, false, fmt.Errorf("error loading chunk: %v", err)
} }
} }
i.next = Posting{} // clear the struct. return uint64(n), true, nil
rv := &i.next }
rv.iterator = i
rv.docNum = uint64(n)
var err error func (i *PostingsIterator) currChunkNext(nChunk uint32) error {
var normBits uint64 if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
rv.freq, normBits, err = i.readFreqNorm() err := i.loadChunk(int(nChunk))
if err != nil { if err != nil {
return nil, err return fmt.Errorf("error loading chunk: %v", err)
} }
rv.norm = math.Float32frombits(uint32(normBits)) }
if i.locBitmap.Contains(n) {
// read off 'freq' locations // read off freq/offsets even though we don't care about them
rv.locs = make([]segment.Location, rv.freq) _, _, hasLocs, err := i.readFreqNormHasLocs()
locs := make([]Location, rv.freq)
for j := 0; j < int(rv.freq); j++ {
err := i.readLocation(&locs[j])
if err != nil { if err != nil {
return nil, err return err
}
if i.includeLocs && hasLocs {
numLocsBytes, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading location numLocsBytes: %v", err)
} }
rv.locs[j] = &locs[j]
// skip over all the location bytes
_, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent)
if err != nil {
return err
} }
} }
return rv, nil return nil
}
// DocNum1Hit reports the docNum of a "1-hit" optimized iterator. The
// second result is false, and the docNum 0, when the iterator is not
// 1-hit encoded or its single docNum has already been consumed.
func (p *PostingsIterator) DocNum1Hit() (uint64, bool) {
	if p.normBits1Hit == 0 || p.docNum1Hit == DocNum1HitFinished {
		return 0, false
	}
	return p.docNum1Hit, true
}
// PostingsIteratorFromBitmap builds a PostingsIterator whose "actual"
// bitmap is bm and whose "actual" iterator walks bm. The includeFreqNorm
// and includeLocs flags are stored on the iterator unchanged. The
// returned error is always nil.
func PostingsIteratorFromBitmap(bm *roaring.Bitmap,
	includeFreqNorm, includeLocs bool) (*PostingsIterator, error) {
	rv := &PostingsIterator{
		includeFreqNorm: includeFreqNorm,
		includeLocs:     includeLocs,
	}
	rv.ActualBM = bm
	rv.Actual = bm.Iterator()
	return rv, nil
}
// PostingsIteratorFrom1Hit constructs a PostingsIterator given a
// 1-hit docNum.
func PostingsIteratorFrom1Hit(docNum1Hit, normBits1Hit uint64,
includeFreqNorm, includeLocs bool) (*PostingsIterator, error) {
return &PostingsIterator{
docNum1Hit: docNum1Hit,
normBits1Hit: normBits1Hit,
includeFreqNorm: includeFreqNorm,
includeLocs: includeLocs,
}, nil
} }
// Posting is a single entry in a postings list // Posting is a single entry in a postings list
type Posting struct { type Posting struct {
iterator *PostingsIterator
docNum uint64 docNum uint64
freq uint64 freq uint64
norm float32 norm float32
locs []segment.Location locs []segment.Location
} }
// Size returns an estimate, in bytes, of the memory held by this posting:
// the static struct size plus the reported size of each location.
func (p *Posting) Size() int {
	total := reflectStaticSizePosting
	for idx := range p.locs {
		total += p.locs[idx].Size()
	}
	return total
}
// Number returns the document number of this posting in this segment // Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 { func (p *Posting) Number() uint64 {
return p.docNum return p.docNum
} }
// Frequency returns the frequence of occurance of this term in this doc/field // Frequency returns the frequencies of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 { func (p *Posting) Frequency() uint64 {
return p.freq return p.freq
} }
@ -374,12 +851,12 @@ func (p *Posting) Norm() float64 {
return float64(p.norm) return float64(p.norm)
} }
// Locations returns the location information for each occurance // Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location { func (p *Posting) Locations() []segment.Location {
return p.locs return p.locs
} }
// Location represents the location of a single occurance // Location represents the location of a single occurrence
type Location struct { type Location struct {
field string field string
pos uint64 pos uint64
@ -388,28 +865,34 @@ type Location struct {
ap []uint64 ap []uint64
} }
// Size returns an estimate, in bytes, of the memory held by this
// location: the static struct size, the field name bytes, and the array
// position entries.
func (l *Location) Size() int {
	fieldBytes := len(l.field)
	arrayPosBytes := len(l.ap) * size.SizeOfUint64
	return reflectStaticSizeLocation + fieldBytes + arrayPosBytes
}
// Field returns the name of the field (useful in composite fields to know // Field returns the name of the field (useful in composite fields to know
// which original field the value came from) // which original field the value came from)
func (l *Location) Field() string { func (l *Location) Field() string {
return l.field return l.field
} }
// Start returns the start byte offset of this occurance // Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 { func (l *Location) Start() uint64 {
return l.start return l.start
} }
// End returns the end byte offset of this occurance // End returns the end byte offset of this occurrence
func (l *Location) End() uint64 { func (l *Location) End() uint64 {
return l.end return l.end
} }
// Pos returns the 1-based phrase position of this occurance // Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 { func (l *Location) Pos() uint64 {
return l.pos return l.pos
} }
// ArrayPositions returns the array position vector associated with this occurance // ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 { func (l *Location) ArrayPositions() []uint64 {
return l.ap return l.ap
} }

@ -20,16 +20,24 @@ import (
"fmt" "fmt"
"io" "io"
"os" "os"
"reflect"
"sync" "sync"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
"github.com/couchbase/vellum" "github.com/couchbase/vellum"
mmap "github.com/edsrzf/mmap-go" mmap "github.com/edsrzf/mmap-go"
"github.com/golang/snappy" "github.com/golang/snappy"
) )
var reflectStaticSizeSegmentBase int
// init records the static (shallow) size of SegmentBase, as reported by
// reflect, in reflectStaticSizeSegmentBase.
func init() {
	reflectStaticSizeSegmentBase = int(reflect.TypeOf(SegmentBase{}).Size())
}
// Open returns a zap impl of a segment // Open returns a zap impl of a segment
func Open(path string) (segment.Segment, error) { func Open(path string) (segment.Segment, error) {
f, err := os.Open(path) f, err := os.Open(path)
@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) {
SegmentBase: SegmentBase{ SegmentBase: SegmentBase{
mem: mm[0 : len(mm)-FooterSize], mem: mm[0 : len(mm)-FooterSize],
fieldsMap: make(map[string]uint16), fieldsMap: make(map[string]uint16),
fieldDvIterMap: make(map[uint16]*docValueIterator), fieldDvReaders: make(map[uint16]*docValueReader),
}, },
f: f, f: f,
mm: mm, mm: mm,
path: path, path: path,
refs: 1, refs: 1,
} }
rv.SegmentBase.updateSize()
err = rv.loadConfig() err = rv.loadConfig()
if err != nil { if err != nil {
@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) {
return nil, err return nil, err
} }
err = rv.loadDvIterators() err = rv.loadDvReaders()
if err != nil { if err != nil {
_ = rv.Close() _ = rv.Close()
return nil, err return nil, err
@ -89,7 +98,39 @@ type SegmentBase struct {
fieldsIndexOffset uint64 fieldsIndexOffset uint64
docValueOffset uint64 docValueOffset uint64
dictLocs []uint64 dictLocs []uint64
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
fieldDvNames []string // field names cached in fieldDvReaders
size uint64
}
// Size returns the value cached in sb.size, converted to int. The cached
// value is written by updateSize; this method does not recompute it.
func (sb *SegmentBase) Size() int {
return int(sb.size)
}
func (sb *SegmentBase) updateSize() {
sizeInBytes := reflectStaticSizeSegmentBase +
cap(sb.mem)
// fieldsMap
for k, _ := range sb.fieldsMap {
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16
}
// fieldsInv, dictLocs
for _, entry := range sb.fieldsInv {
sizeInBytes += len(entry) + size.SizeOfString
}
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64
// fieldDvReaders
for _, v := range sb.fieldDvReaders {
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr
if v != nil {
sizeInBytes += v.size()
}
}
sb.size = uint64(sizeInBytes)
} }
func (sb *SegmentBase) AddRef() {} func (sb *SegmentBase) AddRef() {}
@ -111,56 +152,19 @@ type Segment struct {
refs int64 refs int64
} }
func (s *Segment) SizeInBytes() uint64 { func (s *Segment) Size() int {
// 8 /* size of file pointer */ // 8 /* size of file pointer */
// 4 /* size of version -> uint32 */ // 4 /* size of version -> uint32 */
// 4 /* size of crc -> uint32 */ // 4 /* size of crc -> uint32 */
sizeOfUints := 16 sizeOfUints := 16
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints
// mutex, refs -> int64 // mutex, refs -> int64
sizeInBytes += 16 sizeInBytes += 16
// do not include the mmap'ed part // do not include the mmap'ed part
return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) return sizeInBytes + s.SegmentBase.Size() - cap(s.mem)
}
func (s *SegmentBase) SizeInBytes() uint64 {
// 4 /* size of memCRC -> uint32 */
// 4 /* size of chunkFactor -> uint32 */
// 8 /* size of numDocs -> uint64 */
// 8 /* size of storedIndexOffset -> uint64 */
// 8 /* size of fieldsIndexOffset -> uint64 */
// 8 /* size of docValueOffset -> uint64 */
sizeInBytes := 40
sizeInBytes += len(s.mem) + int(segment.SizeOfSlice)
// fieldsMap
for k, _ := range s.fieldsMap {
sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */
}
sizeInBytes += int(segment.SizeOfMap) /* overhead from map */
// fieldsInv, dictLocs
for _, entry := range s.fieldsInv {
sizeInBytes += (len(entry) + int(segment.SizeOfString))
}
sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */
sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */
// fieldDvIterMap
sizeInBytes += len(s.fieldDvIterMap) *
int(segment.SizeOfPointer+2 /* size of uint16 */)
for _, entry := range s.fieldDvIterMap {
if entry != nil {
sizeInBytes += int(entry.sizeInBytes())
}
}
sizeInBytes += int(segment.SizeOfMap)
return uint64(sizeInBytes)
} }
func (s *Segment) AddRef() { func (s *Segment) AddRef() {
@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error {
verOffset := crcOffset - 4 verOffset := crcOffset - 4
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
if s.version != version { if s.version != Version {
return fmt.Errorf("unsupported version %d", s.version) return fmt.Errorf("unsupported version %d", s.version)
} }
@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error {
} }
func (s *SegmentBase) loadFields() error { func (s *SegmentBase) loadFields() error {
// NOTE for now we assume the fields index immediately preceeds // NOTE for now we assume the fields index immediately precedes
// the footer, and if this changes, need to adjust accordingly (or // the footer, and if this changes, need to adjust accordingly (or
// store explicit length), where s.mem was sliced from s.mm in Open(). // store explicit length), where s.mem was sliced from s.mm in Open().
fieldsIndexEnd := uint64(len(s.mem)) fieldsIndexEnd := uint64(len(s.mem))
@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
} }
rv.fstReader, err = rv.fst.Reader()
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
}
} }
} }
} }
@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
return rv, nil return rv, nil
} }
// visitDocumentCtx bundles scratch state that is recycled between
// VisitDocument() calls so that each call does not allocate it again.
type visitDocumentCtx struct {
	buf      []byte       // passed to snappy.Decode as the destination and kept for the next call
	reader   bytes.Reader // reset over each document's stored meta bytes
	arrayPos []uint64     // scratch backing store for a field's array positions
}

// visitDocumentCtxPool hands out visitDocumentCtx values; when the pool has
// nothing to reuse it creates a fresh, zero-valued context.
var visitDocumentCtxPool = sync.Pool{
	New: func() interface{} {
		return new(visitDocumentCtx)
	},
}
// VisitDocument invokes the DocFieldValueVistor for each stored field // VisitDocument invokes the DocFieldValueVistor for each stored field
// for the specified doc number // for the specified doc number
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
defer visitDocumentCtxPool.Put(vdc)
return s.visitDocument(vdc, num, visitor)
}
func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64,
visitor segment.DocumentFieldValueVisitor) error {
// first make sure this is a valid number in this segment // first make sure this is a valid number in this segment
if num < s.numDocs { if num < s.numDocs {
meta, compressed := s.getDocStoredMetaAndCompressed(num) meta, compressed := s.getDocStoredMetaAndCompressed(num)
uncompressed, err := snappy.Decode(nil, compressed)
vdc.reader.Reset(meta)
// handle _id field special case
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
idFieldVal := compressed[:idFieldValLen]
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
if !keepGoing {
visitDocumentCtxPool.Put(vdc)
return nil
}
// handle non-"_id" fields
compressed = compressed[idFieldValLen:]
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
if err != nil { if err != nil {
return err return err
} }
// now decode meta and process
reader := bytes.NewReader(meta)
decoder := govarint.NewU64Base128Decoder(reader)
keepGoing := true
for keepGoing { for keepGoing {
field, err := decoder.GetU64() field, err := binary.ReadUvarint(&vdc.reader)
if err == io.EOF { if err == io.EOF {
break break
} }
if err != nil { if err != nil {
return err return err
} }
typ, err := decoder.GetU64() typ, err := binary.ReadUvarint(&vdc.reader)
if err != nil { if err != nil {
return err return err
} }
offset, err := decoder.GetU64() offset, err := binary.ReadUvarint(&vdc.reader)
if err != nil { if err != nil {
return err return err
} }
l, err := decoder.GetU64() l, err := binary.ReadUvarint(&vdc.reader)
if err != nil { if err != nil {
return err return err
} }
numap, err := decoder.GetU64() numap, err := binary.ReadUvarint(&vdc.reader)
if err != nil { if err != nil {
return err return err
} }
var arrayPos []uint64 var arrayPos []uint64
if numap > 0 { if numap > 0 {
arrayPos = make([]uint64, numap) if cap(vdc.arrayPos) < int(numap) {
vdc.arrayPos = make([]uint64, numap)
}
arrayPos = vdc.arrayPos[:numap]
for i := 0; i < int(numap); i++ { for i := 0; i < int(numap); i++ {
ap, err := decoder.GetU64() ap, err := binary.ReadUvarint(&vdc.reader)
if err != nil { if err != nil {
return err return err
} }
@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal
value := uncompressed[offset : offset+l] value := uncompressed[offset : offset+l]
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
} }
vdc.buf = uncompressed
} }
return nil return nil
} }
// DocID returns the value of the _id field for the given docNum.
//
// When num is not a valid document number in this segment (num >= numDocs)
// it returns a nil slice and a nil error. An error is returned when the
// _id length cannot be decoded from the stored meta, or when the decoded
// length is larger than the stored data it is supposed to index into.
//
// The returned slice is a sub-slice of the bytes handed back by
// getDocStoredMetaAndCompressed; it is not copied.
func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
	if num >= s.numDocs {
		return nil, nil
	}

	vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
	// hand the context back on every path, including the error returns
	defer visitDocumentCtxPool.Put(vdc)

	meta, compressed := s.getDocStoredMetaAndCompressed(num)

	vdc.reader.Reset(meta)

	// handle _id field special case: the first uvarint in meta is the
	// length of the _id value stored at the front of compressed
	idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
	if err != nil {
		return nil, err
	}

	// a corrupt length must surface as an error rather than a slice panic
	if idFieldValLen > uint64(len(compressed)) {
		return nil, fmt.Errorf("doc %d: _id length %d exceeds stored data length %d",
			num, idFieldValLen, len(compressed))
	}

	return compressed[:idFieldValLen], nil
}
// Count returns the number of documents in this segment. // Count returns the number of documents in this segment.
func (s *SegmentBase) Count() uint64 { func (s *SegmentBase) Count() uint64 {
return s.numDocs return s.numDocs
@ -343,16 +417,27 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
return nil, err return nil, err
} }
var postings *PostingsList postingsList := emptyPostingsList
for _, id := range ids {
postings, err = idDict.postingsList([]byte(id), nil, postings) sMax, err := idDict.fst.GetMaxKey()
if err != nil { if err != nil {
return nil, err return nil, err
} }
if postings.postings != nil { sMaxStr := string(sMax)
rv.Or(postings.postings) filteredIds := make([]string, 0, len(ids))
for _, id := range ids {
if id <= sMaxStr {
filteredIds = append(filteredIds, id)
} }
} }
for _, id := range filteredIds {
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
if err != nil {
return nil, err
}
postingsList.OrInto(rv)
}
} }
return rv, nil return rv, nil
@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) {
return s.dictLocs[fieldIDPlus1-1], nil return s.dictLocs[fieldIDPlus1-1], nil
} }
func (s *SegmentBase) loadDvIterators() error { func (s *SegmentBase) loadDvReaders() error {
if s.docValueOffset == fieldNotUninverted { if s.docValueOffset == fieldNotUninverted {
return nil return nil
} }
var read uint64 var read uint64
for fieldID, field := range s.fieldsInv { for fieldID, field := range s.fieldsInv {
fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) var fieldLocStart, fieldLocEnd uint64
var n int
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 { if n <= 0 {
return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID)
} }
s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc)
read += uint64(n) read += uint64(n)
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 {
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID)
} }
read += uint64(n)
fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
if fieldDvReader != nil {
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader
s.fieldDvNames = append(s.fieldDvNames, field)
}
}
return nil return nil
} }

@ -15,7 +15,6 @@
package zap package zap
import ( import (
"bytes"
"encoding/binary" "encoding/binary"
"io" "io"
@ -25,28 +24,29 @@ import (
// writes out the length of the roaring bitmap in bytes as varint // writes out the length of the roaring bitmap in bytes as varint
// then writes out the roaring bitmap itself // then writes out the roaring bitmap itself
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) { reuseBufVarint []byte) (int, error) {
reuseBuf.Reset() buf, err := r.ToBytes()
// write out postings list to memory so we know the len
postingsListLen, err := r.WriteTo(reuseBuf)
if err != nil { if err != nil {
return 0, err return 0, err
} }
var tw int var tw int
// write out the length of this postings list
n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen)) // write out the length
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
nw, err := w.Write(reuseBufVarint[:n]) nw, err := w.Write(reuseBufVarint[:n])
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
// write out the postings list itself
nw, err = w.Write(reuseBuf.Bytes()) // write out the roaring bytes
nw, err = w.Write(buf)
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
return tw, nil return tw, nil
} }
@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset
return err return err
} }
// write out 32-bit version // write out 32-bit version
err = binary.Write(w, binary.BigEndian, version) err = binary.Write(w, binary.BigEndian, Version)
if err != nil { if err != nil {
return err return err
} }

@ -15,10 +15,10 @@
package scorch package scorch
import ( import (
"bytes"
"container/heap" "container/heap"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"reflect"
"sort" "sort"
"sync" "sync"
"sync/atomic" "sync/atomic"
@ -27,8 +27,13 @@ import (
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
lev2 "github.com/couchbase/vellum/levenshtein2"
) )
// lb1 and lb2 are reusable, thread-safe Levenshtein automaton builders.
// Both are constructed in init; getLevAutomaton uses lb1 for fuzziness 1
// and lb2 for fuzziness 2.
var lb1, lb2 *lev2.LevenshteinAutomatonBuilder
type asynchSegmentResult struct { type asynchSegmentResult struct {
dictItr segment.DictionaryIterator dictItr segment.DictionaryIterator
@ -40,15 +45,36 @@ type asynchSegmentResult struct {
err error err error
} }
// reflectStaticSizeIndexSnapshot is the size in bytes of the IndexSnapshot
// struct itself, as reported by reflect. It is filled in by init.
var reflectStaticSizeIndexSnapshot int

// init records the static size of IndexSnapshot and constructs the shared
// Levenshtein automaton builders lb1 and lb2. It panics if either builder
// cannot be created.
func init() {
	reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(IndexSnapshot{}).Size())

	var err error
	if lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true); err != nil {
		panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
	}
	if lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true); err != nil {
		panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
	}
}
type IndexSnapshot struct { type IndexSnapshot struct {
parent *Scorch parent *Scorch
segment []*SegmentSnapshot segment []*SegmentSnapshot
offsets []uint64 offsets []uint64
internal map[string][]byte internal map[string][]byte
epoch uint64 epoch uint64
size uint64
creator string
m sync.Mutex // Protects the fields that follow. m sync.Mutex // Protects the fields that follow.
refs int64 refs int64
m2 sync.Mutex // Protects the fields that follow.
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
} }
func (i *IndexSnapshot) Segments() []*SegmentSnapshot { func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) {
return err return err
} }
// Close delegates to DecRef and returns whatever error DecRef reports.
func (i *IndexSnapshot) Close() error {
return i.DecRef()
}
// Size returns the value cached in i.size, converted to int. The cached
// value is maintained by updateSize; this method does not recompute it.
func (i *IndexSnapshot) Size() int {
return int(i.size)
}
// updateSize adds the static size of the IndexSnapshot struct plus the
// Size() of every segment snapshot to i.size.
//
// NOTE(review): the total is added to the current i.size rather than
// assigned, so calling this more than once on the same snapshot counts
// everything again -- presumably it is only called once per snapshot; confirm
// against callers.
func (i *IndexSnapshot) updateSize() {
	added := uint64(reflectStaticSizeIndexSnapshot)
	for idx := range i.segment {
		added += uint64(i.segment[idx].Size())
	}
	i.size += added
}
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
results := make(chan *asynchSegmentResult) results := make(chan *asynchSegmentResult)
for index, segment := range i.segment { for index, segment := range i.segment {
go func(index int, segment *SegmentSnapshot) { go func(index int, segment *SegmentSnapshot) {
dict, err := segment.Dictionary(field) dict, err := segment.segment.Dictionary(field)
if err != nil { if err != nil {
results <- &asynchSegmentResult{err: err} results <- &asynchSegmentResult{err: err}
} else { } else {
@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
if next != nil { if next != nil {
rv.cursors = append(rv.cursors, &segmentDictCursor{ rv.cursors = append(rv.cursors, &segmentDictCursor{
itr: asr.dictItr, itr: asr.dictItr,
curr: next, curr: *next,
}) })
} }
} }
@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
}) })
} }
// FieldDictRegexp returns a field dictionary over the terms of field that are
// accepted by the automaton segment.ParseRegexp builds from termRegex. A
// parse failure is returned to the caller unchanged.
func (i *IndexSnapshot) FieldDictRegexp(field string,
	termRegex string) (index.FieldDict, error) {
	// TODO: potential optimization where the literal prefix represents the,
	//       entire regexp, allowing us to use PrefixIterator(prefixTerm)?

	automaton, startKey, endKey, err := segment.ParseRegexp(termRegex)
	if err != nil {
		return nil, err
	}

	// every segment's dictionary is walked with the same automaton and bounds
	makeItr := func(dict segment.TermDictionary) segment.DictionaryIterator {
		return dict.AutomatonIterator(automaton, startKey, endKey)
	}
	return i.newIndexSnapshotFieldDict(field, makeItr)
}
// getLevAutomaton builds a Levenshtein automaton for term with the shared
// builder that matches the requested fuzziness: lb1 for 1 and lb2 for 2.
// Every other fuzziness value, including 0, yields an error.
func (i *IndexSnapshot) getLevAutomaton(term string,
	fuzziness uint8) (vellum.Automaton, error) {
	switch fuzziness {
	case 1:
		return lb1.BuildDfa(term, fuzziness)
	case 2:
		return lb2.BuildDfa(term, fuzziness)
	default:
		return nil, fmt.Errorf("fuzziness exceeds the max limit")
	}
}
// FieldDictFuzzy returns a field dictionary over the terms of field that are
// accepted by a Levenshtein automaton built for term at the given fuzziness.
//
// When prefix is non-empty, iteration is bounded by the byte range that
// starts at prefix and ends at segment.IncrementBytes(prefix); otherwise
// both bounds are nil.
//
// An error is returned when fuzziness is outside what getLevAutomaton
// accepts. Values that do not fit in a uint8 are rejected up front instead
// of being narrowed, since narrowing would silently wrap them (257 would
// become 1, -1 would become 255).
func (i *IndexSnapshot) FieldDictFuzzy(field string,
	term string, fuzziness int, prefix string) (index.FieldDict, error) {
	if fuzziness < 0 || fuzziness > 0xff {
		// same message getLevAutomaton uses for an unsupported fuzziness
		return nil, fmt.Errorf("fuzziness exceeds the max limit")
	}

	a, err := i.getLevAutomaton(term, uint8(fuzziness))
	if err != nil {
		return nil, err
	}

	var prefixBeg, prefixEnd []byte
	if prefix != "" {
		prefixBeg = []byte(prefix)
		prefixEnd = segment.IncrementBytes(prefixBeg)
	}

	return i.newIndexSnapshotFieldDict(field, func(dict segment.TermDictionary) segment.DictionaryIterator {
		return dict.AutomatonIterator(a, prefixBeg, prefixEnd)
	})
}
// FieldDictOnly returns a field dictionary for field whose per-segment
// iterators come from OnlyIterator(onlyTerms, includeCount).
func (i *IndexSnapshot) FieldDictOnly(field string,
	onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
	makeItr := func(dict segment.TermDictionary) segment.DictionaryIterator {
		return dict.OnlyIterator(onlyTerms, includeCount)
	}
	return i.newIndexSnapshotFieldDict(field, makeItr)
}
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
results := make(chan *asynchSegmentResult) results := make(chan *asynchSegmentResult)
for index, segment := range i.segment { for index, segment := range i.segment {
@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) {
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
rv = document.NewDocument(id) rv = document.NewDocument(id)
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool {
if name == "_id" { if name == "_id" {
return true return true
} }
// copy value, array positions to preserve them beyond the scope of this callback
value := append([]byte(nil), val...)
arrayPos := append([]uint64(nil), pos...)
switch typ { switch typ {
case 't': case 't':
rv.AddField(document.NewTextField(name, pos, value)) rv.AddField(document.NewTextField(name, arrayPos, value))
case 'n': case 'n':
rv.AddField(document.NewNumericFieldFromBytes(name, pos, value)) rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value))
case 'd': case 'd':
rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value)) rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value))
case 'b': case 'b':
rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value)) rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value))
case 'g': case 'g':
rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value)) rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value))
} }
return true return true
@ -307,26 +403,17 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
} }
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
var found bool v, err := i.segment[segmentIndex].DocID(localDocNum)
var rv string
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool {
if field == "_id" {
found = true
rv = string(value)
return false
}
return true
})
if err != nil { if err != nil {
return "", err return "", err
} }
if v == nil {
if found {
return rv, nil
}
return "", fmt.Errorf("document number %d not found", docNum) return "", fmt.Errorf("document number %d not found", docNum)
} }
return string(v), nil
}
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
// FIXME could be done more efficiently directly, but reusing for simplicity // FIXME could be done more efficiently directly, but reusing for simplicity
tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false) tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false)
@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
rv := i.allocTermFieldReaderDicts(field)
rv := &IndexSnapshotTermFieldReader{
term: term, rv.term = term
field: field, rv.field = field
snapshot: i, rv.snapshot = i
postings: make([]segment.PostingsList, len(i.segment)), if rv.postings == nil {
iterators: make([]segment.PostingsIterator, len(i.segment)), rv.postings = make([]segment.PostingsList, len(i.segment))
includeFreq: includeFreq, }
includeNorm: includeNorm, if rv.iterators == nil {
includeTermVectors: includeTermVectors, rv.iterators = make([]segment.PostingsIterator, len(i.segment))
} }
rv.segmentOffset = 0
rv.includeFreq = includeFreq
rv.includeNorm = includeNorm
rv.includeTermVectors = includeTermVectors
rv.currPosting = nil
rv.currID = rv.currID[:0]
if rv.dicts == nil {
rv.dicts = make([]segment.TermDictionary, len(i.segment))
for i, segment := range i.segment { for i, segment := range i.segment {
dict, err := segment.Dictionary(field) dict, err := segment.segment.Dictionary(field)
if err != nil { if err != nil {
return nil, err return nil, err
} }
pl, err := dict.PostingsList(string(term), nil) rv.dicts[i] = dict
}
}
for i, segment := range i.segment {
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i])
if err != nil { if err != nil {
return nil, err return nil, err
} }
rv.postings[i] = pl rv.postings[i] = pl
rv.iterators[i] = pl.Iterator() rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i])
} }
atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1)) atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1))
return rv, nil return rv, nil
} }
// allocTermFieldReaderDicts returns a term field reader for field. If a
// recycled reader is available in i.fieldTFRs it is removed from the end of
// that field's list and returned; otherwise a new zero-valued reader is
// allocated. Access to i.fieldTFRs is guarded by i.m2.
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) {
	i.m2.Lock()
	// indexing a nil map yields a nil slice, so no separate nil check is needed
	recycled := i.fieldTFRs[field]
	n := len(recycled)
	if n == 0 {
		i.m2.Unlock()
		return &IndexSnapshotTermFieldReader{}
	}

	tfr = recycled[n-1]
	recycled[n-1] = nil // drop the list's reference to the reader being handed out
	i.fieldTFRs[field] = recycled[:n-1]
	i.m2.Unlock()
	return tfr
}
// recycleTermFieldReader appends tfr to the list of recycled readers kept
// under tfr.field in i.fieldTFRs, creating the map on first use. Nothing is
// stored when this snapshot is no longer the parent's root.
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
	i.parent.rootLock.RLock()
	stillRoot := i.parent.root == i
	i.parent.rootLock.RUnlock()
	if !stillRoot {
		// if we're not the current root (mutations happened), don't bother recycling
		return
	}

	i.m2.Lock()
	if i.fieldTFRs == nil {
		i.fieldTFRs = make(map[string][]*IndexSnapshotTermFieldReader)
	}
	field := tfr.field
	i.fieldTFRs[field] = append(i.fieldTFRs[field], tfr)
	i.m2.Unlock()
}
func docNumberToBytes(buf []byte, in uint64) []byte { func docNumberToBytes(buf []byte, in uint64) []byte {
if len(buf) != 8 { if len(buf) != 8 {
if cap(buf) >= 8 { if cap(buf) >= 8 {
@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte {
} }
func docInternalToNumber(in index.IndexInternalID) (uint64, error) { func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
var res uint64 if len(in) != 8 {
err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in)
if err != nil {
return 0, err
} }
return res, nil return binary.BigEndian.Uint64(in), nil
} }
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
fields []string, visitor index.DocumentFieldTermVisitor) error { fields []string, visitor index.DocumentFieldTermVisitor) error {
_, err := i.documentVisitFieldTerms(id, fields, visitor, nil)
return err
}
func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID,
fields []string, visitor index.DocumentFieldTermVisitor,
dvs segment.DocVisitState) (segment.DocVisitState, error) {
docNum, err := docInternalToNumber(id) docNum, err := docInternalToNumber(id)
if err != nil { if err != nil {
return err return nil, err
} }
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
if segmentIndex >= len(i.segment) { if segmentIndex >= len(i.segment) {
return nil return nil, nil
}
_, dvs, err = i.documentVisitFieldTermsOnSegment(
segmentIndex, localDocNum, fields, nil, visitor, dvs)
return dvs, err
} }
func (i *IndexSnapshot) documentVisitFieldTermsOnSegment(
segmentIndex int, localDocNum uint64, fields []string, cFields []string,
visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) (
cFieldsOut []string, dvsOut segment.DocVisitState, err error) {
ss := i.segment[segmentIndex] ss := i.segment[segmentIndex]
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { var vFields []string // fields that are visitable via the segment
// get the list of doc value persisted fields
pFields, err := zaps.VisitableDocValueFields() ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable)
if ssvOk && ssv != nil {
vFields, err = ssv.VisitableDocValueFields()
if err != nil { if err != nil {
return err return nil, nil, err
} }
// assort the fields for which terms look up have to
// be performed runtime
dvPendingFields := extractDvPendingFields(fields, pFields)
if len(dvPendingFields) == 0 {
// all fields are doc value persisted
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
} }
// concurrently trigger the runtime doc value preparations for var errCh chan error
// pending fields as well as the visit of the persisted doc values
errCh := make(chan error, 1) // cFields represents the fields that we'll need from the
// cachedDocs, and might be optionally be provided by the caller,
// if the caller happens to know we're on the same segmentIndex
// from a previous invocation
if cFields == nil {
cFields = subtractStrings(fields, vFields)
if !ss.cachedDocs.hasFields(cFields) {
errCh = make(chan error, 1)
go func() { go func() {
defer close(errCh) err := ss.cachedDocs.prepareFields(cFields, ss)
err := ss.cachedDocs.prepareFields(fields, ss)
if err != nil { if err != nil {
errCh <- err errCh <- err
} }
close(errCh)
}() }()
}
}
// visit the persisted dv while the cache preparation is in progress if ssvOk && ssv != nil && len(vFields) > 0 {
err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs)
if err != nil { if err != nil {
return err return nil, nil, err
}
} }
// err out if fieldCache preparation failed if errCh != nil {
err = <-errCh err = <-errCh
if err != nil { if err != nil {
return err return nil, nil, err
}
} }
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) if len(cFields) > 0 {
return nil ss.cachedDocs.visitDoc(localDocNum, cFields, visitor)
}
return cFields, dvs, nil
} }
return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) func (i *IndexSnapshot) DocValueReader(fields []string) (
index.DocValueReader, error) {
return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil
} }
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, type DocValueReader struct {
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error { i *IndexSnapshot
err := ss.cachedDocs.prepareFields(fields, ss) fields []string
dvs segment.DocVisitState
currSegmentIndex int
currCachedFields []string
}
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
visitor index.DocumentFieldTermVisitor) (err error) {
docNum, err := docInternalToNumber(id)
if err != nil { if err != nil {
return err return err
} }
visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor) segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum)
if segmentIndex >= len(dvr.i.segment) {
return nil return nil
} }
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, if dvr.currSegmentIndex != segmentIndex {
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) { dvr.currSegmentIndex = segmentIndex
dvr.currCachedFields = nil
for _, field := range fields {
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists {
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
for {
i := bytes.Index(tlist, TermSeparatorSplitSlice)
if i < 0 {
break
}
visitor(field, tlist[0:i])
tlist = tlist[i+1:]
} }
dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment(
dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs)
return err
} }
func (i *IndexSnapshot) DumpAll() chan interface{} {
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
} }
func (i *IndexSnapshot) DumpDoc(id string) chan interface{} {
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
} }
func (i *IndexSnapshot) DumpFields() chan interface{} {
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
} }
func extractDvPendingFields(requestedFields, persistedFields []string) []string { // subtractStrings returns set a minus elements of set b.
removeMap := map[string]struct{}{} func subtractStrings(a, b []string) []string {
for _, str := range persistedFields { if len(b) == 0 {
removeMap[str] = struct{}{} return a
} }
rv := make([]string, 0, len(requestedFields)) rv := make([]string, 0, len(a))
for _, s := range requestedFields { OUTER:
if _, ok := removeMap[s]; !ok { for _, as := range a {
rv = append(rv, s) for _, bs := range b {
if as == bs {
continue OUTER
}
} }
rv = append(rv, as)
} }
return rv return rv
} }

@ -23,12 +23,13 @@ import (
type segmentDictCursor struct { type segmentDictCursor struct {
itr segment.DictionaryIterator itr segment.DictionaryIterator
curr *index.DictEntry curr index.DictEntry
} }
type IndexSnapshotFieldDict struct { type IndexSnapshotFieldDict struct {
snapshot *IndexSnapshot snapshot *IndexSnapshot
cursors []*segmentDictCursor cursors []*segmentDictCursor
entry index.DictEntry
} }
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) } func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) }
@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} {
} }
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
if len(i.cursors) <= 0 { if len(i.cursors) == 0 {
return nil, nil return nil, nil
} }
rv := i.cursors[0].curr i.entry = i.cursors[0].curr
next, err := i.cursors[0].itr.Next() next, err := i.cursors[0].itr.Next()
if err != nil { if err != nil {
return nil, err return nil, err
@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
heap.Pop(i) heap.Pop(i)
} else { } else {
// modified heap, fix it // modified heap, fix it
i.cursors[0].curr = next i.cursors[0].curr = *next
heap.Fix(i, 0) heap.Fix(i, 0)
} }
// look for any other entries with the exact same term // look for any other entries with the exact same term
for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term { for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term {
rv.Count += i.cursors[0].curr.Count i.entry.Count += i.cursors[0].curr.Count
next, err := i.cursors[0].itr.Next() next, err := i.cursors[0].itr.Next()
if err != nil { if err != nil {
return nil, err return nil, err
@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
heap.Pop(i) heap.Pop(i)
} else { } else {
// modified heap, fix it // modified heap, fix it
i.cursors[0].curr = next i.cursors[0].curr = *next
heap.Fix(i, 0) heap.Fix(i, 0)
} }
} }
return rv, nil return &i.entry, nil
} }
func (i *IndexSnapshotFieldDict) Close() error { func (i *IndexSnapshotFieldDict) Close() error {

@ -16,17 +16,30 @@ package scorch
import ( import (
"bytes" "bytes"
"reflect"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeIndexSnapshotDocIDReader int

// init records the static size of IndexSnapshotDocIDReader, as reported by
// reflect, once at package load.
func init() {
	readerType := reflect.TypeOf(IndexSnapshotDocIDReader{})
	reflectStaticSizeIndexSnapshotDocIDReader = int(readerType.Size())
}
type IndexSnapshotDocIDReader struct { type IndexSnapshotDocIDReader struct {
snapshot *IndexSnapshot snapshot *IndexSnapshot
iterators []roaring.IntIterable iterators []roaring.IntIterable
segmentOffset int segmentOffset int
} }
// Size returns the sum of the reflect-computed static struct size and
// size.SizeOfPtr. Data referenced by snapshot and iterators is not counted.
func (i *IndexSnapshotDocIDReader) Size() int {
	staticBytes := reflectStaticSizeIndexSnapshotDocIDReader
	return staticBytes + size.SizeOfPtr
}
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
for i.segmentOffset < len(i.iterators) { for i.segmentOffset < len(i.iterators) {
if !i.iterators[i.segmentOffset].HasNext() { if !i.iterators[i.segmentOffset].HasNext() {

@ -16,16 +16,27 @@ package scorch
import ( import (
"bytes" "bytes"
"fmt"
"reflect"
"sync/atomic" "sync/atomic"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeIndexSnapshotTermFieldReader int

// init records the static size of IndexSnapshotTermFieldReader, as reported
// by reflect, once at package load.
func init() {
	readerType := reflect.TypeOf(IndexSnapshotTermFieldReader{})
	reflectStaticSizeIndexSnapshotTermFieldReader = int(readerType.Size())
}
type IndexSnapshotTermFieldReader struct { type IndexSnapshotTermFieldReader struct {
term []byte term []byte
field string field string
snapshot *IndexSnapshot snapshot *IndexSnapshot
dicts []segment.TermDictionary
postings []segment.PostingsList postings []segment.PostingsList
iterators []segment.PostingsIterator iterators []segment.PostingsIterator
segmentOffset int segmentOffset int
@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct {
currID index.IndexInternalID currID index.IndexInternalID
} }
// Size returns an estimate, in bytes, for this reader. It adds up the static
// struct size, size.SizeOfPtr, the lengths of term, field and currID, the
// Size() of every postings list and iterator, and the Size() of the current
// posting when one is set.
func (i *IndexSnapshotTermFieldReader) Size() int {
	total := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr
	total += len(i.term) + len(i.field) + len(i.currID)

	for idx := range i.postings {
		total += i.postings[idx].Size()
	}
	for idx := range i.iterators {
		total += i.iterators[idx].Size()
	}

	if posting := i.currPosting; posting != nil {
		total += posting.Size()
	}
	return total
}
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
rv := preAlloced rv := preAlloced
if rv == nil { if rv == nil {
rv = &index.TermFieldDoc{} rv = &index.TermFieldDoc{}
} }
// find the next hit // find the next hit
for i.segmentOffset < len(i.postings) { for i.segmentOffset < len(i.iterators) {
next, err := i.iterators[i.segmentOffset].Next() next, err := i.iterators[i.segmentOffset].Next()
if err != nil { if err != nil {
return nil, err return nil, err
@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin
} }
if i.includeTermVectors { if i.includeTermVectors {
locs := next.Locations() locs := next.Locations()
if cap(rv.Vectors) < len(locs) {
rv.Vectors = make([]*index.TermFieldVector, len(locs)) rv.Vectors = make([]*index.TermFieldVector, len(locs))
backing := make([]index.TermFieldVector, len(locs))
for i := range backing {
rv.Vectors[i] = &backing[i]
}
}
rv.Vectors = rv.Vectors[:len(locs)]
for i, loc := range locs { for i, loc := range locs {
rv.Vectors[i] = &index.TermFieldVector{ *rv.Vectors[i] = index.TermFieldVector{
Start: loc.Start(), Start: loc.Start(),
End: loc.End(), End: loc.End(),
Pos: loc.Pos(), Pos: loc.Pos(),
@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo
} }
*i = *(i2.(*IndexSnapshotTermFieldReader)) *i = *(i2.(*IndexSnapshotTermFieldReader))
} }
// FIXME do something better num, err := docInternalToNumber(ID)
next, err := i.Next(preAlloced)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
} }
if next == nil { segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
return nil, nil if segIndex >= len(i.snapshot.segment) {
return nil, fmt.Errorf("computed segment index %d out of bounds %d",
segIndex, len(i.snapshot.segment))
} }
for bytes.Compare(next.ID, ID) < 0 { // skip directly to the target segment
next, err = i.Next(preAlloced) i.segmentOffset = segIndex
next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if next == nil { if next == nil {
break // we jumped directly to the segment that should have contained it
// but it wasn't there, so reuse Next() which should correctly
// get the next hit after it (we moved i.segmentOffset)
return i.Next(preAlloced)
} }
if preAlloced == nil {
preAlloced = &index.TermFieldDoc{}
} }
return next, nil preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
i.snapshot.offsets[segIndex])
i.postingToTermFieldDoc(next, preAlloced)
i.currID = preAlloced.ID
i.currPosting = next
return preAlloced, nil
} }
func (i *IndexSnapshotTermFieldReader) Count() uint64 { func (i *IndexSnapshotTermFieldReader) Count() uint64 {
@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 {
func (i *IndexSnapshotTermFieldReader) Close() error { func (i *IndexSnapshotTermFieldReader) Close() error {
if i.snapshot != nil { if i.snapshot != nil {
atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1)) atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1))
i.snapshot.recycleTermFieldReader(i)
} }
return nil return nil
} }

@ -19,7 +19,7 @@ import (
"log" "log"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/boltdb/bolt" bolt "github.com/etcd-io/bbolt"
) )
type RollbackPoint struct { type RollbackPoint struct {

@ -15,42 +15,25 @@
package scorch package scorch
import ( import (
"bytes"
"sync" "sync"
"sync/atomic"
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
) )
var TermSeparator byte = 0xff var TermSeparator byte = 0xff
var TermSeparatorSplitSlice = []byte{TermSeparator} var TermSeparatorSplitSlice = []byte{TermSeparator}
type SegmentDictionarySnapshot struct {
s *SegmentSnapshot
d segment.TermDictionary
}
func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted?
return s.d.PostingsList(term, s.s.deleted)
}
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator {
return s.d.Iterator()
}
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator {
return s.d.PrefixIterator(prefix)
}
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator {
return s.d.RangeIterator(start, end)
}
type SegmentSnapshot struct { type SegmentSnapshot struct {
id uint64 id uint64
segment segment.Segment segment segment.Segment
deleted *roaring.Bitmap deleted *roaring.Bitmap
creator string
cachedDocs *cachedDocs cachedDocs *cachedDocs
} }
@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel
return s.segment.VisitDocument(num, visitor) return s.segment.VisitDocument(num, visitor)
} }
func (s *SegmentSnapshot) Count() uint64 { func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) {
return s.segment.DocID(num)
}
func (s *SegmentSnapshot) Count() uint64 {
rv := s.segment.Count() rv := s.segment.Count()
if s.deleted != nil { if s.deleted != nil {
rv -= s.deleted.GetCardinality() rv -= s.deleted.GetCardinality()
@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 {
return rv return rv
} }
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) {
d, err := s.segment.Dictionary(field)
if err != nil {
return nil, err
}
return &SegmentDictionarySnapshot{
s: s,
d: d,
}, nil
}
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
rv, err := s.segment.DocNumbers(docIDs) rv, err := s.segment.DocNumbers(docIDs)
if err != nil { if err != nil {
@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
return rv, nil return rv, nil
} }
// DocNumbersLive returns bitsit containing doc numbers for all live docs // DocNumbersLive returns a bitmap containing doc numbers for all live docs
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
rv := roaring.NewBitmap() rv := roaring.NewBitmap()
rv.AddRange(0, s.segment.Count()) rv.AddRange(0, s.segment.Count())
@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string {
return s.segment.Fields() return s.segment.Fields()
} }
// Size returns the underlying segment's Size(), plus the deleted bitmap's
// GetSizeInBytes() when a bitmap is present, plus the Size() of cachedDocs.
func (s *SegmentSnapshot) Size() (rv int) {
	total := s.segment.Size()
	if bm := s.deleted; bm != nil {
		total += int(bm.GetSizeInBytes())
	}
	return total + s.cachedDocs.Size()
}
type cachedFieldDocs struct { type cachedFieldDocs struct {
m sync.Mutex
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
err error // Non-nil if there was an error when preparing this cachedFieldDocs. err error // Non-nil if there was an error when preparing this cachedFieldDocs.
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
size uint64
}
func (cfd *cachedFieldDocs) Size() int {
var rv int
cfd.m.Lock()
for _, entry := range cfd.docs {
rv += 8 /* size of uint64 */ + len(entry)
}
cfd.m.Unlock()
return rv
} }
func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
defer close(cfd.readyCh) cfd.m.Lock()
defer func() {
close(cfd.readyCh)
cfd.m.Unlock()
}()
cfd.size += uint64(size.SizeOfUint64) /* size field */
dict, err := ss.segment.Dictionary(field) dict, err := ss.segment.Dictionary(field)
if err != nil { if err != nil {
cfd.err = err cfd.err = err
return return
} }
var postings segment.PostingsList
var postingsItr segment.PostingsIterator
dictItr := dict.Iterator() dictItr := dict.Iterator()
next, err := dictItr.Next() next, err := dictItr.Next()
for err == nil && next != nil { for err == nil && next != nil {
postings, err1 := dict.PostingsList(next.Term, nil) var err1 error
postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings)
if err1 != nil { if err1 != nil {
cfd.err = err1 cfd.err = err1
return return
} }
postingsItr := postings.Iterator() cfd.size += uint64(size.SizeOfUint64) /* map key */
postingsItr = postings.Iterator(false, false, false, postingsItr)
nextPosting, err2 := postingsItr.Next() nextPosting, err2 := postingsItr.Next()
for err2 == nil && nextPosting != nil { for err2 == nil && nextPosting != nil {
docNum := nextPosting.Number() docNum := nextPosting.Number()
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
cfd.size += uint64(len(next.Term) + 1) // map value
nextPosting, err2 = postingsItr.Next() nextPosting, err2 = postingsItr.Next()
} }
@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
type cachedDocs struct { type cachedDocs struct {
m sync.Mutex // As the cache is asynchronously prepared, need a lock m sync.Mutex // As the cache is asynchronously prepared, need a lock
cache map[string]*cachedFieldDocs // Keyed by field cache map[string]*cachedFieldDocs // Keyed by field
size uint64
} }
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
c.m.Lock() c.m.Lock()
if c.cache == nil { if c.cache == nil {
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
} }
@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
docs: make(map[uint64][]byte), docs: make(map[uint64][]byte),
} }
go c.cache[field].prepareFields(field, ss) go c.cache[field].prepareField(field, ss)
} }
} }
@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
c.m.Lock() c.m.Lock()
} }
c.updateSizeLOCKED()
c.m.Unlock() c.m.Unlock()
return nil return nil
} }
func (c *cachedDocs) sizeInBytes() uint64 { // hasFields returns true if the cache has all the given fields
sizeInBytes := 0 func (c *cachedDocs) hasFields(fields []string) bool {
c.m.Lock() c.m.Lock()
for _, field := range fields {
if _, exists := c.cache[field]; !exists {
c.m.Unlock()
return false // found a field not in cache
}
}
c.m.Unlock()
return true
}
// Size returns the value currently stored in c.size, read atomically and
// converted to int. It does not take c.m.
func (c *cachedDocs) Size() int {
	cached := atomic.LoadUint64(&c.size)
	return int(cached)
}
func (c *cachedDocs) updateSizeLOCKED() {
sizeInBytes := 0
for k, v := range c.cache { // cachedFieldDocs for k, v := range c.cache { // cachedFieldDocs
sizeInBytes += len(k) sizeInBytes += len(k)
if v != nil { if v != nil {
for _, entry := range v.docs { // docs sizeInBytes += v.Size()
sizeInBytes += 8 /* size of uint64 */ + len(entry)
} }
} }
atomic.StoreUint64(&c.size, uint64(sizeInBytes))
} }
func (c *cachedDocs) visitDoc(localDocNum uint64,
fields []string, visitor index.DocumentFieldTermVisitor) {
c.m.Lock()
for _, field := range fields {
if cachedFieldDocs, exists := c.cache[field]; exists {
c.m.Unlock()
<-cachedFieldDocs.readyCh
c.m.Lock()
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
for {
i := bytes.Index(tlist, TermSeparatorSplitSlice)
if i < 0 {
break
}
visitor(field, tlist[0:i])
tlist = tlist[i+1:]
}
}
}
}
c.m.Unlock() c.m.Unlock()
return uint64(sizeInBytes)
} }

@ -16,63 +16,125 @@ package scorch
import ( import (
"encoding/json" "encoding/json"
"io/ioutil" "reflect"
"sync/atomic" "sync/atomic"
) )
// Stats tracks statistics about the index // Stats tracks statistics about the index, fields that are
// prefixed like CurXxxx are gauges (can go up and down),
// and fields that are prefixed like TotXxxx are monotonically
// increasing counters.
type Stats struct { type Stats struct {
updates, deletes, batches, errors uint64 TotUpdates uint64
analysisTime, indexTime uint64 TotDeletes uint64
termSearchersStarted uint64
termSearchersFinished uint64
numPlainTextBytesIndexed uint64
numItemsIntroduced uint64
numItemsPersisted uint64
i *Scorch
}
func (s *Stats) statsMap() (map[string]interface{}, error) { TotBatches uint64
m := map[string]interface{}{} TotBatchesEmpty uint64
m["updates"] = atomic.LoadUint64(&s.updates) TotBatchIntroTime uint64
m["deletes"] = atomic.LoadUint64(&s.deletes) MaxBatchIntroTime uint64
m["batches"] = atomic.LoadUint64(&s.batches)
m["errors"] = atomic.LoadUint64(&s.errors)
m["analysis_time"] = atomic.LoadUint64(&s.analysisTime)
m["index_time"] = atomic.LoadUint64(&s.indexTime)
m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted)
m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished)
m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed)
m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced)
m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted)
if s.i.path != "" {
finfos, err := ioutil.ReadDir(s.i.path)
if err != nil {
return nil, err
}
var numFilesOnDisk, numBytesUsedDisk uint64 CurRootEpoch uint64
LastPersistedEpoch uint64
LastMergedEpoch uint64
for _, finfo := range finfos { TotOnErrors uint64
if !finfo.IsDir() {
numBytesUsedDisk += uint64(finfo.Size()) TotAnalysisTime uint64
numFilesOnDisk++ TotIndexTime uint64
}
} TotIndexedPlainTextBytes uint64
TotTermSearchersStarted uint64
TotTermSearchersFinished uint64
TotIntroduceLoop uint64
TotIntroduceSegmentBeg uint64
TotIntroduceSegmentEnd uint64
TotIntroducePersistBeg uint64
TotIntroducePersistEnd uint64
TotIntroduceMergeBeg uint64
TotIntroduceMergeEnd uint64
TotIntroduceRevertBeg uint64
TotIntroduceRevertEnd uint64
TotIntroducedItems uint64
TotIntroducedSegmentsBatch uint64
TotIntroducedSegmentsMerge uint64
TotPersistLoopBeg uint64
TotPersistLoopErr uint64
TotPersistLoopProgress uint64
TotPersistLoopWait uint64
TotPersistLoopWaitNotified uint64
TotPersistLoopEnd uint64
TotPersistedItems uint64
TotItemsToPersist uint64
TotPersistedSegments uint64
m["num_bytes_used_disk"] = numBytesUsedDisk TotPersisterSlowMergerPause uint64
m["num_files_on_disk"] = numFilesOnDisk TotPersisterSlowMergerResume uint64
TotPersisterNapPauseCompleted uint64
TotPersisterMergerNapBreak uint64
TotFileMergeLoopBeg uint64
TotFileMergeLoopErr uint64
TotFileMergeLoopEnd uint64
TotFileMergePlan uint64
TotFileMergePlanErr uint64
TotFileMergePlanNone uint64
TotFileMergePlanOk uint64
TotFileMergePlanTasks uint64
TotFileMergePlanTasksDone uint64
TotFileMergePlanTasksErr uint64
TotFileMergePlanTasksSegments uint64
TotFileMergePlanTasksSegmentsEmpty uint64
TotFileMergeSegmentsEmpty uint64
TotFileMergeSegments uint64
TotFileSegmentsAtRoot uint64
TotFileMergeWrittenBytes uint64
TotFileMergeZapBeg uint64
TotFileMergeZapEnd uint64
TotFileMergeZapTime uint64
MaxFileMergeZapTime uint64
TotFileMergeIntroductions uint64
TotFileMergeIntroductionsDone uint64
TotFileMergeIntroductionsSkipped uint64
TotMemMergeBeg uint64
TotMemMergeErr uint64
TotMemMergeDone uint64
TotMemMergeZapBeg uint64
TotMemMergeZapEnd uint64
TotMemMergeZapTime uint64
MaxMemMergeZapTime uint64
TotMemMergeSegments uint64
TotMemorySegmentsAtRoot uint64
} }
return m, nil // atomically populates the returned map
func (s *Stats) ToMap() map[string]interface{} {
m := map[string]interface{}{}
sve := reflect.ValueOf(s).Elem()
svet := sve.Type()
for i := 0; i < svet.NumField(); i++ {
svef := sve.Field(i)
if svef.CanAddr() {
svefp := svef.Addr().Interface()
m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64))
}
}
return m
} }
// MarshalJSON implements json.Marshaler // MarshalJSON implements json.Marshaler, and in contrast to standard
// json marshaling provides atomic safety
func (s *Stats) MarshalJSON() ([]byte, error) { func (s *Stats) MarshalJSON() ([]byte, error) {
m, err := s.statsMap() return json.Marshal(s.ToMap())
if err != nil {
return nil, err
}
return json.Marshal(m)
} }

@ -17,7 +17,7 @@ package boltdb
import ( import (
"bytes" "bytes"
"github.com/boltdb/bolt" bolt "github.com/etcd-io/bbolt"
) )
type Iterator struct { type Iterator struct {

@ -16,7 +16,7 @@ package boltdb
import ( import (
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/boltdb/bolt" bolt "github.com/etcd-io/bbolt"
) )
type Reader struct { type Reader struct {

@ -30,7 +30,7 @@ import (
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/boltdb/bolt" bolt "github.com/etcd-io/bbolt"
) )
const ( const (
@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore,
bo.ReadOnly = ro bo.ReadOnly = ro
} }
if initialMmapSize, ok := config["initialMmapSize"].(int); ok {
bo.InitialMmapSize = initialMmapSize
} else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok {
bo.InitialMmapSize = int(initialMmapSize)
}
db, err := bolt.Open(path, 0600, bo) db, err := bolt.Open(path, 0600, bo)
if err != nil { if err != nil {
return nil, err return nil, err

@ -15,11 +15,20 @@
package upsidedown package upsidedown
import ( import (
"reflect"
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
) )
var reflectStaticSizeIndexReader int

// init records the static size of IndexReader, as reported by reflect, once
// at package load.
func init() {
	readerType := reflect.TypeOf(IndexReader{})
	reflectStaticSizeIndexReader = int(readerType.Size())
}
type IndexReader struct { type IndexReader struct {
index *UpsideDownCouch index *UpsideDownCouch
kvreader store.KVReader kvreader store.KVReader
@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte {
} }
return rv return rv
} }
// DocValueReader returns a DocValueReader bound to this IndexReader and the
// given fields. The fields slice is stored as passed (not copied), and the
// returned error is always nil.
func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) {
	dvr := &DocValueReader{
		i:      i,
		fields: fields,
	}
	return dvr, nil
}
// DocValueReader pairs an IndexReader with a fixed list of field names; its
// VisitDocValues method visits the terms of those fields for a document.
type DocValueReader struct {
// i is the reader that VisitDocValues delegates to.
i *IndexReader
// fields are the field names passed to DocumentVisitFieldTerms on each visit.
fields []string
}
// VisitDocValues delegates to the underlying IndexReader's
// DocumentVisitFieldTerms for document id, using the fields this reader was
// created with, and returns whatever error that call returns.
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
	visitor index.DocumentFieldTermVisitor) error {
	reader, fields := dvr.i, dvr.fields
	return reader.DocumentVisitFieldTerms(id, fields, visitor)
}

@ -16,13 +16,27 @@ package upsidedown
import ( import (
"bytes" "bytes"
"reflect"
"sort" "sort"
"sync/atomic" "sync/atomic"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeUpsideDownCouchTermFieldReader int
var reflectStaticSizeUpsideDownCouchDocIDReader int

// init records the static sizes of the two reader structs, as reported by
// reflect, once at package load.
func init() {
	termFieldReaderType := reflect.TypeOf(UpsideDownCouchTermFieldReader{})
	reflectStaticSizeUpsideDownCouchTermFieldReader = int(termFieldReaderType.Size())

	docIDReaderType := reflect.TypeOf(UpsideDownCouchDocIDReader{})
	reflectStaticSizeUpsideDownCouchDocIDReader = int(docIDReaderType.Size())
}
type UpsideDownCouchTermFieldReader struct { type UpsideDownCouchTermFieldReader struct {
count uint64 count uint64
indexReader *IndexReader indexReader *IndexReader
@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct {
includeTermVectors bool includeTermVectors bool
} }
// Size returns an estimate, in bytes, for this reader. It adds up the static
// struct size, size.SizeOfPtr, the lengths of term and keyBuf, the Size() of
// tfrPrealloc, and the Size() of tfrNext when it is non-nil.
func (r *UpsideDownCouchTermFieldReader) Size() int {
	total := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr
	total += len(r.term)
	total += r.tfrPrealloc.Size()
	total += len(r.keyBuf)

	if next := r.tfrNext; next != nil {
		total += next.Size()
	}
	return total
}
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) { func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) {
bufNeeded := termFrequencyRowKeySize(term, nil) bufNeeded := termFrequencyRowKeySize(term, nil)
if bufNeeded < dictionaryRowKeySize(term) { if bufNeeded < dictionaryRowKeySize(term) {
@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct {
onlyMode bool onlyMode bool
} }
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { func (r *UpsideDownCouchDocIDReader) Size() int {
sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader +
reflectStaticSizeIndexReader + size.SizeOfPtr
for _, entry := range r.only {
sizeInBytes += size.SizeOfString + len(entry)
}
return sizeInBytes
}
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
startBytes := []byte{0x0} startBytes := []byte{0x0}
endBytes := []byte{0xff} endBytes := []byte{0xff}

@ -20,10 +20,22 @@ import (
"fmt" "fmt"
"io" "io"
"math" "math"
"reflect"
"github.com/blevesearch/bleve/size"
"github.com/golang/protobuf/proto" "github.com/golang/protobuf/proto"
) )
var reflectStaticSizeTermFrequencyRow int
var reflectStaticSizeTermVector int

// init records the static sizes of TermFrequencyRow and TermVector, as
// reported by reflect, once at package load.
func init() {
	rowType := reflect.TypeOf(TermFrequencyRow{})
	reflectStaticSizeTermFrequencyRow = int(rowType.Size())

	vectorType := reflect.TypeOf(TermVector{})
	reflectStaticSizeTermVector = int(vectorType.Size())
}
const ByteSeparator byte = 0xff const ByteSeparator byte = 0xff
type UpsideDownCouchRowStream chan UpsideDownCouchRow type UpsideDownCouchRowStream chan UpsideDownCouchRow
@ -358,6 +370,11 @@ type TermVector struct {
end uint64 end uint64
} }
// Size returns the static struct size plus size.SizeOfPtr plus
// size.SizeOfUint64 for each element of arrayPositions.
func (tv *TermVector) Size() int {
	positionsBytes := len(tv.arrayPositions) * size.SizeOfUint64
	return reflectStaticSizeTermVector + size.SizeOfPtr + positionsBytes
}
func (tv *TermVector) String() string { func (tv *TermVector) String() string {
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
} }
@ -371,6 +388,18 @@ type TermFrequencyRow struct {
field uint16 field uint16
} }
// Size returns the static struct size plus the lengths of term and doc plus
// the Size() of every entry in vectors.
func (tfr *TermFrequencyRow) Size() int {
	total := reflectStaticSizeTermFrequencyRow + len(tfr.term) + len(tfr.doc)
	for idx := range tfr.vectors {
		total += tfr.vectors[idx].Size()
	}
	return total
}
func (tfr *TermFrequencyRow) Term() []byte { func (tfr *TermFrequencyRow) Term() []byte {
return tfr.term return tfr.term
} }
@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error {
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error { func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
tfr.doc = key[3+len(term)+1:] tfr.doc = key[3+len(term)+1:]
if len(tfr.doc) <= 0 { if len(tfr.doc) == 0 {
return fmt.Errorf("invalid term frequency key, empty docid") return fmt.Errorf("invalid term frequency key, empty docid")
} }

@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
} }
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
if len(in) <= 0 { if len(in) == 0 {
return nil return nil
} }
@ -810,6 +810,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
} }
} }
if len(batch.IndexOps) > 0 {
go func() { go func() {
for _, doc := range batch.IndexOps { for _, doc := range batch.IndexOps {
if doc != nil { if doc != nil {
@ -819,6 +820,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
} }
} }
}() }()
}
// retrieve back index rows concurrent with analysis // retrieve back index rows concurrent with analysis
docBackIndexRowErr := error(nil) docBackIndexRowErr := error(nil)
@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
} else { } else {
atomic.AddUint64(&udc.stats.errors, 1) atomic.AddUint64(&udc.stats.errors, 1)
} }
persistedCallback := batch.PersistedCallback()
if persistedCallback != nil {
persistedCallback(err)
}
return return
} }

@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest {
Explain: req.Explain, Explain: req.Explain,
Sort: req.Sort.Copy(), Sort: req.Sort.Copy(),
IncludeLocations: req.IncludeLocations, IncludeLocations: req.IncludeLocations,
Score: req.Score,
} }
return &rv return &rv
} }

@ -50,6 +50,12 @@ const storePath = "store"
var mappingInternalKey = []byte("_mapping") var mappingInternalKey = []byte("_mapping")
// SearchQueryStartCallbackKey is the context key under which SearchInContext
// looks up a SearchQueryStartCallbackFn.
const SearchQueryStartCallbackKey = "_search_query_start_callback_key"
// SearchQueryEndCallbackKey is the context key under which SearchInContext
// looks up a SearchQueryEndCallbackFn.
const SearchQueryEndCallbackKey = "_search_query_end_callback_key"
// SearchQueryStartCallbackFn is called by SearchInContext, before results are
// collected, with the memory estimate from memNeededForSearch. If it returns
// a non-nil error, SearchInContext returns that error.
type SearchQueryStartCallbackFn func(size uint64) error
// SearchQueryEndCallbackFn is called via defer from SearchInContext with the
// same memory estimate; its returned error is discarded.
type SearchQueryEndCallbackFn func(size uint64) error
func indexStorePath(path string) string { func indexStorePath(path string) string {
return path + string(os.PathSeparator) + storePath return path + string(os.PathSeparator) + storePath
} }
@ -362,6 +368,68 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) {
return i.SearchInContext(context.Background(), req) return i.SearchInContext(context.Background(), req)
} }
var documentMatchEmptySize int
var searchContextEmptySize int
var facetResultEmptySize int
var documentEmptySize int

// init caches the Size() of a zero-valued DocumentMatch, SearchContext,
// FacetResult and Document for use by memNeededForSearch.
func init() {
	var (
		dm search.DocumentMatch
		sc search.SearchContext
		fr search.FacetResult
		d  document.Document
	)
	documentMatchEmptySize = dm.Size()
	searchContextEmptySize = sc.Size()
	facetResultEmptySize = fr.Size()
	documentEmptySize = d.Size()
}
// memNeededForSearch estimates, in bytes, the RAM needed to execute req with
// the given searcher and collector. The estimate is a sum of the collector
// and searcher sizes plus per-hit, per-facet and per-document overheads based
// on the cached empty-value sizes.
func memNeededForSearch(req *SearchRequest,
	searcher search.Searcher,
	topnCollector *collector.TopNCollector) uint64 {

	// Size + From => number of hits
	hits := req.Size + req.From

	// the backing store is capped once hits exceed PreAllocSizeSkipCap
	backingSize := hits + 1
	if hits > collector.PreAllocSizeSkipCap {
		backingSize = collector.PreAllocSizeSkipCap + 1
	}
	numDocMatches := backingSize + searcher.DocumentMatchPoolSize()

	// overhead, size in bytes from collector
	estimate := topnCollector.Size()

	// pre-allocing DocumentMatchPool
	estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize

	// searcher overhead
	estimate += searcher.Size()

	// overhead from results, lowestMatchOutsideResults
	estimate += (numDocMatches + 1) * documentMatchEmptySize

	// additional overhead from SearchResult
	estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus

	// overhead from facet results
	if req.Facets != nil {
		estimate += len(req.Facets) * facetResultEmptySize
	}

	// highlighting, store
	if len(req.Fields) > 0 || req.Highlight != nil {
		estimate += hits * documentEmptySize
	}

	return uint64(estimate)
}
// SearchInContext executes a search request operation within the provided // SearchInContext executes a search request operation within the provided
// Context. Returns a SearchResult object or an error. // Context. Returns a SearchResult object or an error.
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) {
@ -390,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{ searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{
Explain: req.Explain, Explain: req.Explain,
IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, IncludeTermVectors: req.IncludeLocations || req.Highlight != nil,
Score: req.Score,
}) })
if err != nil { if err != nil {
return nil, err return nil, err
@ -428,6 +497,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
collector.SetFacetsBuilder(facetsBuilder) collector.SetFacetsBuilder(facetsBuilder)
} }
memNeeded := memNeededForSearch(req, searcher, collector)
if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil {
if cbF, ok := cb.(SearchQueryStartCallbackFn); ok {
err = cbF(memNeeded)
}
}
if err != nil {
return nil, err
}
if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil {
if cbF, ok := cb.(SearchQueryEndCallbackFn); ok {
defer func() {
_ = cbF(memNeeded)
}()
}
}
err = collector.Collect(ctx, searcher, indexReader) err = collector.Collect(ctx, searcher, indexReader)
if err != nil { if err != nil {
return nil, err return nil, err
@ -459,7 +546,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
doc, err := indexReader.Document(hit.ID) doc, err := indexReader.Document(hit.ID)
if err == nil && doc != nil { if err == nil && doc != nil {
if len(req.Fields) > 0 { if len(req.Fields) > 0 {
for _, f := range req.Fields { fieldsToLoad := deDuplicate(req.Fields)
for _, f := range fieldsToLoad {
for _, docF := range doc.Fields { for _, docF := range doc.Fields {
if f == "*" || docF.Name() == f { if f == "*" || docF.Name() == f {
var value interface{} var value interface{}
@ -533,9 +621,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
return &SearchResult{ return &SearchResult{
Status: &SearchStatus{ Status: &SearchStatus{
Total: 1, Total: 1,
Failed: 0,
Successful: 1, Successful: 1,
Errors: make(map[string]error),
}, },
Request: req, Request: req,
Hits: hits, Hits: hits,
@ -755,3 +841,16 @@ func (f *indexImplFieldDict) Close() error {
} }
return f.indexReader.Close() return f.indexReader.Close()
} }
// deDuplicate returns the entries of fields with duplicates removed. The
// first occurrence of each entry is kept and the input order is preserved.
// The returned slice is always non-nil, even when fields is nil or empty.
func deDuplicate(fields []string) []string {
	// The output can never be longer than the input, so size both
	// containers up front instead of growing them entry by entry.
	seen := make(map[string]struct{}, len(fields))
	ret := make([]string, 0, len(fields))
	for _, entry := range fields {
		if _, exists := seen[entry]; exists {
			continue
		}
		seen[entry] = struct{}{}
		ret = append(ret, entry)
	}
	return ret
}

@ -18,6 +18,7 @@ import (
"encoding/json" "encoding/json"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath"
"github.com/blevesearch/bleve/index/upsidedown" "github.com/blevesearch/bleve/index/upsidedown"
) )
@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) {
} }
func indexMetaPath(path string) string { func indexMetaPath(path string) string {
return path + string(os.PathSeparator) + metaFilename return filepath.Join(path, metaFilename)
} }

@ -42,7 +42,7 @@ type DocumentMapping struct {
Dynamic bool `json:"dynamic"` Dynamic bool `json:"dynamic"`
Properties map[string]*DocumentMapping `json:"properties,omitempty"` Properties map[string]*DocumentMapping `json:"properties,omitempty"`
Fields []*FieldMapping `json:"fields,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"`
DefaultAnalyzer string `json:"default_analyzer"` DefaultAnalyzer string `json:"default_analyzer,omitempty"`
// StructTagKey overrides "json" when looking for field names in struct tags // StructTagKey overrides "json" when looking for field names in struct tags
StructTagKey string `json:"struct_tag_key,omitempty"` StructTagKey string `json:"struct_tag_key,omitempty"`
@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
} }
func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
// allow default "json" tag to be overriden // allow default "json" tag to be overridden
structTagKey := dm.StructTagKey structTagKey := dm.StructTagKey
if structTagKey == "" { if structTagKey == "" {
structTagKey = "json" structTagKey = "json"
} }
val := reflect.ValueOf(data) val := reflect.ValueOf(data)
if !val.IsValid() {
return
}
typ := val.Type() typ := val.Type()
switch typ.Kind() { switch typ.Kind() {
case reflect.Map: case reflect.Map:
@ -420,8 +424,12 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
if subDocMapping != nil { if subDocMapping != nil {
// index by explicit mapping // index by explicit mapping
for _, fieldMapping := range subDocMapping.Fields { for _, fieldMapping := range subDocMapping.Fields {
if fieldMapping.Type == "geopoint" {
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
} else {
fieldMapping.processString(propertyValueString, pathString, path, indexes, context) fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
} }
}
} else if closestDocMapping.Dynamic { } else if closestDocMapping.Dynamic {
// automatic indexing behavior // automatic indexing behavior

@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string {
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error { func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
docType := im.determineType(data) docType := im.determineType(data)
docMapping := im.mappingForType(docType) docMapping := im.mappingForType(docType)
walkContext := im.newWalkContext(doc, docMapping)
if docMapping.Enabled { if docMapping.Enabled {
walkContext := im.newWalkContext(doc, docMapping)
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext) docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
// see if the _all field was disabled // see if the _all field was disabled

@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} {
func lookupPropertyPathPart(data interface{}, part string) interface{} { func lookupPropertyPathPart(data interface{}, part string) interface{} {
val := reflect.ValueOf(data) val := reflect.ValueOf(data)
if !val.IsValid() {
return nil
}
typ := val.Type() typ := val.Type()
switch typ.Kind() { switch typ.Kind() {
case reflect.Map: case reflect.Map:

@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16}
// Interleave the first 32 bits of each uint64 // Interleave the first 32 bits of each uint64
// adapted from org.apache.lucene.util.BitUtil // adapted from org.apache.lucene.util.BitUtil
// whcih was adapted from: // which was adapted from:
// http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN // http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
func Interleave(v1, v2 uint64) uint64 { func Interleave(v1, v2 uint64) uint64 {
v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4] v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4]

@ -77,6 +77,10 @@ func (p PrefixCoded) Int64() (int64, error) {
} }
func ValidPrefixCodedTerm(p string) (bool, int) { func ValidPrefixCodedTerm(p string) (bool, int) {
return ValidPrefixCodedTermBytes([]byte(p))
}
func ValidPrefixCodedTermBytes(p []byte) (bool, int) {
if len(p) > 0 { if len(p) > 0 {
if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 { if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 {
return false, 0 return false, 0

@ -17,15 +17,29 @@ package bleve
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"reflect"
"time" "time"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/datetime/optional" "github.com/blevesearch/bleve/analysis/datetime/optional"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/collector"
"github.com/blevesearch/bleve/search/query" "github.com/blevesearch/bleve/search/query"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeSearchResult int
var reflectStaticSizeSearchStatus int
// init records the static sizes, as reported by reflect, of the
// SearchResult and SearchStatus structs for later size estimation.
func init() {
	reflectStaticSizeSearchResult = int(reflect.TypeOf(SearchResult{}).Size())
	reflectStaticSizeSearchStatus = int(reflect.TypeOf(SearchStatus{}).Size())
}
var cache = registry.NewCache() var cache = registry.NewCache()
const defaultDateTimeParser = optional.Name const defaultDateTimeParser = optional.Name
@ -247,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) {
// Explain triggers inclusion of additional search // Explain triggers inclusion of additional search
// result score explanations. // result score explanations.
// Sort describes the desired order for the results to be returned. // Sort describes the desired order for the results to be returned.
// Score controls the kind of scoring performed
// //
// A special field named "*" can be used to return all fields. // A special field named "*" can be used to return all fields.
type SearchRequest struct { type SearchRequest struct {
@ -259,6 +274,7 @@ type SearchRequest struct {
Explain bool `json:"explain"` Explain bool `json:"explain"`
Sort search.SortOrder `json:"sort"` Sort search.SortOrder `json:"sort"`
IncludeLocations bool `json:"includeLocations"` IncludeLocations bool `json:"includeLocations"`
Score string `json:"score,omitempty"`
} }
func (r *SearchRequest) Validate() error { func (r *SearchRequest) Validate() error {
@ -308,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
Explain bool `json:"explain"` Explain bool `json:"explain"`
Sort []json.RawMessage `json:"sort"` Sort []json.RawMessage `json:"sort"`
IncludeLocations bool `json:"includeLocations"` IncludeLocations bool `json:"includeLocations"`
Score string `json:"score"`
} }
err := json.Unmarshal(input, &temp) err := json.Unmarshal(input, &temp)
@ -334,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
r.Fields = temp.Fields r.Fields = temp.Fields
r.Facets = temp.Facets r.Facets = temp.Facets
r.IncludeLocations = temp.IncludeLocations r.IncludeLocations = temp.IncludeLocations
r.Score = temp.Score
r.Query, err = query.ParseQuery(temp.Q) r.Query, err = query.ParseQuery(temp.Q)
if err != nil { if err != nil {
return err return err
@ -432,6 +450,24 @@ type SearchResult struct {
Facets search.FacetResults `json:"facets"` Facets search.FacetResults `json:"facets"`
} }
// Size returns an estimate, in bytes, of the memory used by the search
// result: the static sizes of SearchResult and SearchStatus, one pointer,
// the size reported by every non-nil hit, and for every facet its name
// plus the size reported by its result.
func (sr *SearchResult) Size() int {
	total := reflectStaticSizeSearchResult + size.SizeOfPtr +
		reflectStaticSizeSearchStatus

	for _, hit := range sr.Hits {
		if hit == nil {
			continue
		}
		total += hit.Size()
	}

	for name, facet := range sr.Facets {
		total += size.SizeOfString + len(name) + facet.Size()
	}

	return total
}
func (sr *SearchResult) String() string { func (sr *SearchResult) String() string {
rv := "" rv := ""
if sr.Total > 0 { if sr.Total > 0 {
@ -488,3 +524,44 @@ func (sr *SearchResult) Merge(other *SearchResult) {
sr.Facets.Merge(other.Facets) sr.Facets.Merge(other.Facets)
} }
// MemoryNeededForSearchResult is an exported helper function to determine
// the RAM, in bytes, needed to accommodate the results for a given search
// request. It returns 0 when req is nil.
func MemoryNeededForSearchResult(req *SearchRequest) uint64 {
	if req == nil {
		return 0
	}

	// number of hits the request asks for (page size plus offset)
	numHits := req.Size + req.From

	// the number of document matches accounted for is capped at
	// collector.PreAllocSizeSkipCap
	numDocMatches := numHits
	if numHits > collector.PreAllocSizeSkipCap {
		numDocMatches = collector.PreAllocSizeSkipCap
	}

	estimate := 0

	// overhead from the SearchResult structure
	var sr SearchResult
	estimate += sr.Size()

	// overhead from results
	var dm search.DocumentMatch
	estimate += numDocMatches * dm.Size()

	// overhead from facet results
	if req.Facets != nil {
		var fr search.FacetResult
		estimate += len(req.Facets) * fr.Size()
	}

	// highlighting, store: one document per hit. The cost is added exactly
	// once; adding it inside a per-hit loop would make the estimate grow
	// quadratically with the number of hits.
	if len(req.Fields) > 0 || req.Highlight != nil {
		var d document.Document
		estimate += numHits * d.Size()
	}

	return uint64(estimate)
}

@ -30,3 +30,23 @@ type Collector interface {
SetFacetsBuilder(facetsBuilder *FacetsBuilder) SetFacetsBuilder(facetsBuilder *FacetsBuilder)
FacetResults() FacetResults FacetResults() FacetResults
} }
// DocumentMatchHandler is the type of the document match callback that
// bleve invokes for each hit during a search.
// When the search is complete, bleve signals this by invoking the
// callback one final time with a nil document match.
// The application should take a copy of the hit/documentMatch
// if it wishes to own it or needs prolonged access to it.
type DocumentMatchHandler func(hit *DocumentMatch) error
// MakeDocumentMatchHandlerKeyType is the type of the context key under
// which a custom MakeDocumentMatchHandler is looked up.
type MakeDocumentMatchHandlerKeyType string
// MakeDocumentMatchHandlerKey is the context key under which an
// application can supply a MakeDocumentMatchHandler.
var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType(
"MakeDocumentMatchHandlerKey")
// MakeDocumentMatchHandler is an optional DocumentMatchHandler
// builder function which applications can pass to bleve.
// The builder gives bleve a DocumentMatchHandler function,
// which bleve will invoke on every document match.
// The loadID result is OR-ed into the collector's needDocIds flag;
// a non-nil err aborts the collection.
type MakeDocumentMatchHandler func(ctx *SearchContext) (
callback DocumentMatchHandler, loadID bool, err error)

@ -25,9 +25,9 @@ type collectStoreHeap struct {
compare collectorCompare compare collectorCompare
} }
func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap { func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap {
rv := &collectStoreHeap{ rv := &collectStoreHeap{
heap: make(search.DocumentMatchCollection, 0, cap), heap: make(search.DocumentMatchCollection, 0, capacity),
compare: compare, compare: compare,
} }
heap.Init(rv) heap.Init(rv)

@ -25,7 +25,7 @@ type collectStoreList struct {
compare collectorCompare compare collectorCompare
} }
func newStoreList(cap int, compare collectorCompare) *collectStoreList { func newStoreList(capacity int, compare collectorCompare) *collectStoreList {
rv := &collectStoreList{ rv := &collectStoreList{
results: list.New(), results: list.New(),
compare: compare, compare: compare,
@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList {
return rv return rv
} }
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch {
size int) *search.DocumentMatch {
c.add(doc) c.add(doc)
if c.len() > size { if c.len() > size {
return c.removeLast() return c.removeLast()

@ -21,9 +21,9 @@ type collectStoreSlice struct {
compare collectorCompare compare collectorCompare
} }
func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice { func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice {
rv := &collectStoreSlice{ rv := &collectStoreSlice{
slice: make(search.DocumentMatchCollection, 0, cap), slice: make(search.DocumentMatchCollection, 0, capacity),
compare: compare, compare: compare,
} }
return rv return rv

@ -16,12 +16,21 @@ package collector
import ( import (
"context" "context"
"reflect"
"time" "time"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeTopNCollector int
// init records the static size, as reported by reflect, of the
// TopNCollector struct for later size estimation.
func init() {
	reflectStaticSizeTopNCollector = int(reflect.TypeOf(TopNCollector{}).Size())
}
type collectorStore interface { type collectorStore interface {
// Add the document, and if the new store size exceeds the provided size // Add the document, and if the new store size exceeds the provided size
// the last element is removed and returned. If the size has not been // the last element is removed and returned. If the size has not been
@ -58,6 +67,8 @@ type TopNCollector struct {
cachedDesc []bool cachedDesc []bool
lowestMatchOutsideResults *search.DocumentMatch lowestMatchOutsideResults *search.DocumentMatch
updateFieldVisitor index.DocumentFieldTermVisitor
dvReader index.DocValueReader
} }
// CheckDoneEvery controls how frequently we check the context deadline // CheckDoneEvery controls how frequently we check the context deadline
@ -98,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
return hc return hc
} }
// Size returns an estimate, in bytes, of the memory used by the collector:
// its static struct size, one pointer, the facets builder (when set), the
// needed field names, and one byte per entry of the cached scoring and
// descending flags.
func (hc *TopNCollector) Size() int {
	total := reflectStaticSizeTopNCollector + size.SizeOfPtr

	if fb := hc.facetsBuilder; fb != nil {
		total += fb.Size()
	}

	for _, field := range hc.neededFields {
		total += size.SizeOfString + len(field)
	}

	total += len(hc.cachedScoring)
	total += len(hc.cachedDesc)

	return total
}
// Collect goes to the index to find the matching documents // Collect goes to the index to find the matching documents
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
startTime := time.Now() startTime := time.Now()
@ -113,8 +140,34 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
} }
searchContext := &search.SearchContext{ searchContext := &search.SearchContext{
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
Collector: hc,
}
hc.dvReader, err = reader.DocValueReader(hc.neededFields)
if err != nil {
return err
}
hc.updateFieldVisitor = func(field string, term []byte) {
if hc.facetsBuilder != nil {
hc.facetsBuilder.UpdateVisitor(field, term)
}
hc.sort.UpdateVisitor(field, term)
}
dmHandlerMaker := MakeTopNDocumentMatchHandler
if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
}
// use the application given builder for making the custom document match
// handler and perform callbacks/invocations on the newly made handler.
dmHandler, loadID, err := dmHandlerMaker(searchContext)
if err != nil {
return err
} }
hc.needDocIds = hc.needDocIds || loadID
select { select {
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
@ -130,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
} }
} }
err = hc.collectSingle(searchContext, reader, next) err = hc.prepareDocumentMatch(searchContext, reader, next)
if err != nil {
break
}
err = dmHandler(next)
if err != nil { if err != nil {
break break
} }
next, err = searcher.Next(searchContext) next, err = searcher.Next(searchContext)
} }
// help finalize/flush the results in case
// of custom document match handlers.
err = dmHandler(nil)
if err != nil {
return err
}
// compute search duration // compute search duration
hc.took = time.Since(startTime) hc.took = time.Since(startTime)
if err != nil { if err != nil {
@ -152,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
var sortByScoreOpt = []string{"_score"} var sortByScoreOpt = []string{"_score"}
func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
var err error reader index.IndexReader, d *search.DocumentMatch) (err error) {
// visit field terms for features that require it (sort, facets) // visit field terms for features that require it (sort, facets)
if len(hc.neededFields) > 0 { if len(hc.neededFields) > 0 {
@ -187,11 +253,24 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
hc.sort.Value(d) hc.sort.Value(d)
} }
return nil
}
func MakeTopNDocumentMatchHandler(
ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) {
var hc *TopNCollector
var ok bool
if hc, ok = ctx.Collector.(*TopNCollector); ok {
return func(d *search.DocumentMatch) error {
if d == nil {
return nil
}
// optimization, we track lowest sorting hit already removed from heap // optimization, we track lowest sorting hit already removed from heap
// with this one comparison, we can avoid all heap operations if // with this one comparison, we can avoid all heap operations if
// this hit would have been added and then immediately removed // this hit would have been added and then immediately removed
if hc.lowestMatchOutsideResults != nil { if hc.lowestMatchOutsideResults != nil {
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults) cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d,
hc.lowestMatchOutsideResults)
if cmp >= 0 { if cmp >= 0 {
// this hit can't possibly be in the result set, so avoid heap ops // this hit can't possibly be in the result set, so avoid heap ops
ctx.DocumentMatchPool.Put(d) ctx.DocumentMatchPool.Put(d)
@ -204,7 +283,8 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
if hc.lowestMatchOutsideResults == nil { if hc.lowestMatchOutsideResults == nil {
hc.lowestMatchOutsideResults = removed hc.lowestMatchOutsideResults = removed
} else { } else {
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults) cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc,
removed, hc.lowestMatchOutsideResults)
if cmp < 0 { if cmp < 0 {
tmp := hc.lowestMatchOutsideResults tmp := hc.lowestMatchOutsideResults
hc.lowestMatchOutsideResults = removed hc.lowestMatchOutsideResults = removed
@ -212,8 +292,10 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
} }
} }
} }
return nil return nil
}, false, nil
}
return nil, false, nil
} }
// visitFieldTerms is responsible for visiting the field terms of the // visitFieldTerms is responsible for visiting the field terms of the
@ -223,13 +305,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc
hc.facetsBuilder.StartDoc() hc.facetsBuilder.StartDoc()
} }
err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) { err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor)
if hc.facetsBuilder != nil {
hc.facetsBuilder.UpdateVisitor(field, term)
}
hc.sort.UpdateVisitor(field, term)
})
if hc.facetsBuilder != nil { if hc.facetsBuilder != nil {
hc.facetsBuilder.EndDoc() hc.facetsBuilder.EndDoc()
} }
@ -257,6 +333,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
return err return err
} }
} }
doc.Complete(nil)
return nil return nil
}) })
@ -288,5 +365,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults {
if hc.facetsBuilder != nil { if hc.facetsBuilder != nil {
return hc.facetsBuilder.Results() return hc.facetsBuilder.Results()
} }
return search.FacetResults{} return nil
} }

@ -17,8 +17,18 @@ package search
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeExplanation int
// init records the static size, as reported by reflect, of the
// Explanation struct for later size estimation.
func init() {
	reflectStaticSizeExplanation = int(reflect.TypeOf(Explanation{}).Size())
}
type Explanation struct { type Explanation struct {
Value float64 `json:"value"` Value float64 `json:"value"`
Message string `json:"message"` Message string `json:"message"`
@ -32,3 +42,14 @@ func (expl *Explanation) String() string {
} }
return string(js) return string(js)
} }
// Size returns an estimate, in bytes, of the memory used by the
// explanation: its static struct size, one pointer, the message bytes,
// and, recursively, the size of every child explanation.
func (expl *Explanation) Size() int {
	total := reflectStaticSizeExplanation + size.SizeOfPtr
	total += len(expl.Message)

	for _, child := range expl.Children {
		total += child.Size()
	}

	return total
}

@ -15,13 +15,25 @@
package facet package facet
import ( import (
"reflect"
"sort" "sort"
"time" "time"
"github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeDateTimeFacetBuilder int
var reflectStaticSizedateTimeRange int
// init records the static sizes, as reported by reflect, of the
// DateTimeFacetBuilder and dateTimeRange structs for later size estimation.
func init() {
	reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(DateTimeFacetBuilder{}).Size())
	reflectStaticSizedateTimeRange = int(reflect.TypeOf(dateTimeRange{}).Size())
}
type dateTimeRange struct { type dateTimeRange struct {
start time.Time start time.Time
end time.Time end time.Time
@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder {
} }
} }
// Size returns an estimate, in bytes, of the memory used by the builder:
// its static struct size, one pointer, the field name, and a per-entry
// cost for every key of termsCount and ranges.
func (fb *DateTimeFacetBuilder) Size() int {
	// fixed cost of one map entry, excluding the bytes of its key
	perTerm := size.SizeOfString + size.SizeOfInt
	perRange := size.SizeOfString + size.SizeOfPtr + reflectStaticSizedateTimeRange

	total := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr + len(fb.field)

	for name := range fb.termsCount {
		total += perTerm + len(name)
	}
	for name := range fb.ranges {
		total += perRange + len(name)
	}

	return total
}
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) {
r := dateTimeRange{ r := dateTimeRange{
start: start, start: start,

@ -15,12 +15,24 @@
package facet package facet
import ( import (
"reflect"
"sort" "sort"
"github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeNumericFacetBuilder int
var reflectStaticSizenumericRange int
// init records the static sizes, as reported by reflect, of the
// NumericFacetBuilder and numericRange structs for later size estimation.
func init() {
	reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(NumericFacetBuilder{}).Size())
	reflectStaticSizenumericRange = int(reflect.TypeOf(numericRange{}).Size())
}
type numericRange struct { type numericRange struct {
min *float64 min *float64
max *float64 max *float64
@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder {
} }
} }
// Size returns an estimate, in bytes, of the memory used by the builder:
// its static struct size, one pointer, the field name, and a per-entry
// cost for every key of termsCount and ranges.
func (fb *NumericFacetBuilder) Size() int {
	// fixed cost of one map entry, excluding the bytes of its key
	perTerm := size.SizeOfString + size.SizeOfInt
	perRange := size.SizeOfString + size.SizeOfPtr + reflectStaticSizenumericRange

	total := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + len(fb.field)

	for name := range fb.termsCount {
		total += perTerm + len(name)
	}
	for name := range fb.ranges {
		total += perRange + len(name)
	}

	return total
}
func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) { func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) {
r := numericRange{ r := numericRange{
min: min, min: min,

@ -15,11 +15,20 @@
package facet package facet
import ( import (
"reflect"
"sort" "sort"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeTermsFacetBuilder int
// init records the static size, as reported by reflect, of the
// TermsFacetBuilder struct for later size estimation.
func init() {
	reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(TermsFacetBuilder{}).Size())
}
type TermsFacetBuilder struct { type TermsFacetBuilder struct {
size int size int
field string field string
@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder {
} }
} }
// Size returns an estimate, in bytes, of the memory used by the builder:
// its static struct size, one pointer, the field name, and a per-entry
// cost for every key of termsCount.
func (fb *TermsFacetBuilder) Size() int {
	// fixed cost of one map entry, excluding the bytes of its key
	perTerm := size.SizeOfString + size.SizeOfInt

	total := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + len(fb.field)
	for term := range fb.termsCount {
		total += perTerm + len(term)
	}

	return total
}
func (fb *TermsFacetBuilder) Field() string { func (fb *TermsFacetBuilder) Field() string {
return fb.field return fb.field
} }

@ -15,11 +15,32 @@
package search package search
import ( import (
"reflect"
"sort" "sort"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeFacetsBuilder int
var reflectStaticSizeFacetResult int
var reflectStaticSizeTermFacet int
var reflectStaticSizeNumericRangeFacet int
var reflectStaticSizeDateRangeFacet int
// init records the static sizes, as reported by reflect, of the facet
// builder and facet result structs for later size estimation.
func init() {
	reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(FacetsBuilder{}).Size())
	reflectStaticSizeFacetResult = int(reflect.TypeOf(FacetResult{}).Size())
	reflectStaticSizeTermFacet = int(reflect.TypeOf(TermFacet{}).Size())
	reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(NumericRangeFacet{}).Size())
	reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(DateRangeFacet{}).Size())
}
type FacetBuilder interface { type FacetBuilder interface {
StartDoc() StartDoc()
UpdateVisitor(field string, term []byte) UpdateVisitor(field string, term []byte)
@ -27,23 +48,40 @@ type FacetBuilder interface {
Result() *FacetResult Result() *FacetResult
Field() string Field() string
Size() int
} }
type FacetsBuilder struct { type FacetsBuilder struct {
indexReader index.IndexReader indexReader index.IndexReader
facets map[string]FacetBuilder facetNames []string
facets []FacetBuilder
fields []string fields []string
} }
func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder {
return &FacetsBuilder{ return &FacetsBuilder{
indexReader: indexReader, indexReader: indexReader,
facets: make(map[string]FacetBuilder, 0),
} }
} }
// Size returns an estimate, in bytes, of the memory used by the builder:
// its static struct size, one pointer, every registered facet builder
// together with its name, and every field name.
func (fb *FacetsBuilder) Size() int {
	total := reflectStaticSizeFacetsBuilder + size.SizeOfPtr

	// facets and facetNames are parallel slices: entry i of one belongs to
	// entry i of the other
	for i, builder := range fb.facets {
		total += size.SizeOfString + builder.Size() + len(fb.facetNames[i])
	}

	for _, field := range fb.fields {
		total += size.SizeOfString + len(field)
	}

	return total
}
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) {
fb.facets[name] = facetBuilder fb.facetNames = append(fb.facetNames, name)
fb.facets = append(fb.facets, facetBuilder)
fb.fields = append(fb.fields, facetBuilder.Field()) fb.fields = append(fb.fields, facetBuilder.Field())
} }
@ -213,6 +251,14 @@ type FacetResult struct {
DateRanges DateRangeFacets `json:"date_ranges,omitempty"` DateRanges DateRangeFacets `json:"date_ranges,omitempty"`
} }
// Size returns an estimate, in bytes, of the memory used by the facet
// result: its static struct size, one pointer, the field name, and a fixed
// cost (static struct size plus one pointer) for every term, numeric range
// and date range entry.
func (fr *FacetResult) Size() int {
	total := reflectStaticSizeFacetResult + size.SizeOfPtr + len(fr.Field)

	total += len(fr.Terms) * (reflectStaticSizeTermFacet + size.SizeOfPtr)
	total += len(fr.NumericRanges) * (reflectStaticSizeNumericRangeFacet + size.SizeOfPtr)
	total += len(fr.DateRanges) * (reflectStaticSizeDateRangeFacet + size.SizeOfPtr)

	return total
}
func (fr *FacetResult) Merge(other *FacetResult) { func (fr *FacetResult) Merge(other *FacetResult) {
fr.Total += other.Total fr.Total += other.Total
fr.Missing += other.Missing fr.Missing += other.Missing
@ -287,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) {
func (fb *FacetsBuilder) Results() FacetResults { func (fb *FacetsBuilder) Results() FacetResults {
fr := make(FacetResults) fr := make(FacetResults)
for facetName, facetBuilder := range fb.facets { for i, facetBuilder := range fb.facets {
facetResult := facetBuilder.Result() facetResult := facetBuilder.Result()
fr[facetName] = facetResult fr[fb.facetNames[i]] = facetResult
} }
return fr return fr
} }

@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int {
// in which case the first return val will be the max // in which case the first return val will be the max
// and the second will be true, indicating max was exceeded // and the second will be true, indicating max was exceeded
func LevenshteinDistanceMax(a, b string, max int) (int, bool) { func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil)
return v, wasMax
}
func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) {
la := len(a) la := len(a)
lb := len(b) lb := len(b)
ld := int(math.Abs(float64(la - lb))) ld := int(math.Abs(float64(la - lb)))
if ld > max { if ld > max {
return max, true return max, true, d
} }
d := make([]int, la+1) if cap(d) < la+1 {
d = make([]int, la+1)
}
d = d[:la+1]
var lastdiag, olddiag, temp int var lastdiag, olddiag, temp int
for i := 1; i <= la; i++ { for i := 1; i <= la; i++ {
@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
} }
// after each row if rowmin isn't less than max stop // after each row if rowmin isn't less than max stop
if rowmin > max { if rowmin > max {
return max, true return max, true, d
} }
} }
return d[la], false return d[la], false, d
} }

@ -14,6 +14,17 @@
package search package search
import (
"reflect"
)
// reflectStaticSizeDocumentMatchPool holds the value reported by
// reflect for the static size of a DocumentMatchPool; it is computed
// once at package initialization.
var reflectStaticSizeDocumentMatchPool int

func init() {
	poolType := reflect.TypeOf(DocumentMatchPool{})
	reflectStaticSizeDocumentMatchPool = int(poolType.Size())
}
// DocumentMatchPoolTooSmall is a callback function that can be executed // DocumentMatchPoolTooSmall is a callback function that can be executed
// when the DocumentMatchPool does not have sufficient capacity // when the DocumentMatchPool does not have sufficient capacity
// By default we just perform just-in-time allocation, but you could log // By default we just perform just-in-time allocation, but you could log

@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
} }
ss = append(ss, sr) ss = append(ss, sr)
} }
if len(ss) < 1 { if len(ss) < 1 {
return searcher.NewMatchNoneSearcher(i) return searcher.NewMatchNoneSearcher(i)
} }
return searcher.NewConjunctionSearcher(i, ss, options) return searcher.NewConjunctionSearcher(i, ss, options)
} }

@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) {
q.Min = m q.Min = m
} }
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
options search.SearcherOptions) (search.Searcher, error) {
ss := make([]search.Searcher, 0, len(q.Disjuncts)) ss := make([]search.Searcher, 0, len(q.Disjuncts))
for _, disjunct := range q.Disjuncts { for _, disjunct := range q.Disjuncts {
sr, err := disjunct.Searcher(i, m, options) sr, err := disjunct.Searcher(i, m, options)
@ -76,9 +77,17 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
} }
ss = append(ss, sr) ss = append(ss, sr)
} }
if len(ss) < 1 { if len(ss) < 1 {
return searcher.NewMatchNoneSearcher(i) return searcher.NewMatchNoneSearcher(i)
} else if len(ss) == 1 && int(q.Min) == ss[0].Min() {
// apply optimization only if both conditions below are satisfied:
// - disjunction searcher has only 1 child searcher
// - parent searcher's min setting is equal to child searcher's min
return ss[0], nil
} }
return searcher.NewDisjunctionSearcher(i, ss, q.Min, options) return searcher.NewDisjunctionSearcher(i, ss, q.Min, options)
} }

@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
} }
expand = func(query Query) (Query, error) { expand = func(query Query) (Query, error) {
switch query.(type) { switch q := query.(type) {
case *QueryStringQuery: case *QueryStringQuery:
q := query.(*QueryStringQuery)
parsed, err := parseQuerySyntax(q.Query) parsed, err := parseQuerySyntax(q.Query)
if err != nil { if err != nil {
return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err) return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err)
} }
return expand(parsed) return expand(parsed)
case *ConjunctionQuery: case *ConjunctionQuery:
q := *query.(*ConjunctionQuery)
children, err := expandSlice(q.Conjuncts) children, err := expandSlice(q.Conjuncts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
q.Conjuncts = children q.Conjuncts = children
return &q, nil return q, nil
case *DisjunctionQuery: case *DisjunctionQuery:
q := *query.(*DisjunctionQuery)
children, err := expandSlice(q.Disjuncts) children, err := expandSlice(q.Disjuncts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
q.Disjuncts = children q.Disjuncts = children
return &q, nil return q, nil
case *BooleanQuery: case *BooleanQuery:
q := *query.(*BooleanQuery)
var err error var err error
q.Must, err = expand(q.Must) q.Must, err = expand(q.Must)
if err != nil { if err != nil {
@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &q, nil return q, nil
default: default:
return query, nil return query, nil
} }

@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// see where to go // see where to go
if !l.seenDot && next == '.' { if !l.seenDot && next == '.' {
// stay in this state // stay in this state
l.seenDot = true
l.buf += string(next) l.buf += string(next)
return inNumOrStrState, true return inNumOrStrState, true
} else if unicode.IsDigit(next) { } else if unicode.IsDigit(next) {

@ -15,7 +15,6 @@
package query package query
import ( import (
"regexp"
"strings" "strings"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
@ -28,7 +27,6 @@ type RegexpQuery struct {
Regexp string `json:"regexp"` Regexp string `json:"regexp"`
FieldVal string `json:"field,omitempty"` FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"` BoostVal *Boost `json:"boost,omitempty"`
compiled *regexp.Regexp
} }
// NewRegexpQuery creates a new Query which finds // NewRegexpQuery creates a new Query which finds
@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti
if q.FieldVal == "" { if q.FieldVal == "" {
field = m.DefaultSearchField() field = m.DefaultSearchField()
} }
err := q.compile()
if err != nil {
return nil, err
}
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
}
func (q *RegexpQuery) Validate() error { // require that pattern NOT be anchored to start and end of term.
return q.compile() // do not attempt to remove trailing $, its presence is not
} // known to interfere with LiteralPrefix() the way ^ does
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
func (q *RegexpQuery) compile() error {
if q.compiled == nil {
// require that pattern NOT be anchored to start and end of term
actualRegexp := q.Regexp actualRegexp := q.Regexp
if strings.HasPrefix(actualRegexp, "^") { if strings.HasPrefix(actualRegexp, "^") {
actualRegexp = actualRegexp[1:] // remove leading ^ actualRegexp = actualRegexp[1:] // remove leading ^
} }
// do not attempt to remove trailing $, it's presence is not
// known to interfere with LiteralPrefix() the way ^ does return searcher.NewRegexpStringSearcher(i, actualRegexp, field,
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc q.BoostVal.Value(), options)
var err error
q.compiled, err = regexp.Compile(actualRegexp)
if err != nil {
return err
}
} }
return nil
func (q *RegexpQuery) Validate() error {
return nil // real validation delayed until searcher constructor
} }

@ -15,7 +15,6 @@
package query package query
import ( import (
"regexp"
"strings" "strings"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
@ -47,7 +46,6 @@ type WildcardQuery struct {
Wildcard string `json:"wildcard"` Wildcard string `json:"wildcard"`
FieldVal string `json:"field,omitempty"` FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"` BoostVal *Boost `json:"boost,omitempty"`
compiled *regexp.Regexp
} }
// NewWildcardQuery creates a new Query which finds // NewWildcardQuery creates a new Query which finds
@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op
if q.FieldVal == "" { if q.FieldVal == "" {
field = m.DefaultSearchField() field = m.DefaultSearchField()
} }
if q.compiled == nil {
var err error
q.compiled, err = q.convertToRegexp()
if err != nil {
return nil, err
}
}
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
}
func (q *WildcardQuery) Validate() error { return searcher.NewRegexpStringSearcher(i, regexpString, field,
var err error q.BoostVal.Value(), options)
q.compiled, err = q.convertToRegexp()
return err
} }
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) { func (q *WildcardQuery) Validate() error {
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) return nil // real validation delayed until searcher constructor
return regexp.Compile(regexpString)
} }

@ -15,13 +15,27 @@
package scorer package scorer
import ( import (
"reflect"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeConjunctionQueryScorer holds the value reported by
// reflect for the static size of a ConjunctionQueryScorer; it is
// computed once at package initialization.
var reflectStaticSizeConjunctionQueryScorer int

func init() {
	scorerType := reflect.TypeOf(ConjunctionQueryScorer{})
	reflectStaticSizeConjunctionQueryScorer = int(scorerType.Size())
}
type ConjunctionQueryScorer struct { type ConjunctionQueryScorer struct {
options search.SearcherOptions options search.SearcherOptions
} }
// Size returns the scorer's static struct size (as measured by reflect
// at init time) plus size.SizeOfPtr.
func (s *ConjunctionQueryScorer) Size() int {
	total := reflectStaticSizeConjunctionQueryScorer
	total += size.SizeOfPtr
	return total
}
func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer { func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer {
return &ConjunctionQueryScorer{ return &ConjunctionQueryScorer{
options: options, options: options,
@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
childrenExplanations = make([]*search.Explanation, len(constituents)) childrenExplanations = make([]*search.Explanation, len(constituents))
} }
locations := []search.FieldTermLocationMap{}
for i, docMatch := range constituents { for i, docMatch := range constituents {
sum += docMatch.Score sum += docMatch.Score
if s.options.Explain { if s.options.Explain {
childrenExplanations[i] = docMatch.Expl childrenExplanations[i] = docMatch.Expl
} }
if docMatch.Locations != nil {
locations = append(locations, docMatch.Locations)
}
} }
newScore := sum newScore := sum
var newExpl *search.Explanation var newExpl *search.Explanation
@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
rv := constituents[0] rv := constituents[0]
rv.Score = newScore rv.Score = newScore
rv.Expl = newExpl rv.Expl = newExpl
if len(locations) == 1 { rv.FieldTermLocations = search.MergeFieldTermLocations(
rv.Locations = locations[0] rv.FieldTermLocations, constituents[1:])
} else if len(locations) > 1 {
rv.Locations = search.MergeLocations(locations)
}
return rv return rv
} }

@ -16,11 +16,20 @@ package scorer
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeConstantScorer holds the value reported by reflect
// for the static size of a ConstantScorer; it is computed once at
// package initialization.
var reflectStaticSizeConstantScorer int

func init() {
	scorerType := reflect.TypeOf(ConstantScorer{})
	reflectStaticSizeConstantScorer = int(scorerType.Size())
}
type ConstantScorer struct { type ConstantScorer struct {
constant float64 constant float64
boost float64 boost float64
@ -30,6 +39,16 @@ type ConstantScorer struct {
queryWeightExplanation *search.Explanation queryWeightExplanation *search.Explanation
} }
// Size returns the scorer's static struct size plus size.SizeOfPtr,
// and additionally the size of the query weight explanation when one
// has been set.
func (s *ConstantScorer) Size() int {
	total := reflectStaticSizeConstantScorer + size.SizeOfPtr

	if expl := s.queryWeightExplanation; expl != nil {
		total += expl.Size()
	}

	return total
}
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer {
rv := ConstantScorer{ rv := ConstantScorer{
options: options, options: options,

@ -16,14 +16,27 @@ package scorer
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeDisjunctionQueryScorer holds the value reported by
// reflect for the static size of a DisjunctionQueryScorer; it is
// computed once at package initialization.
var reflectStaticSizeDisjunctionQueryScorer int

func init() {
	scorerType := reflect.TypeOf(DisjunctionQueryScorer{})
	reflectStaticSizeDisjunctionQueryScorer = int(scorerType.Size())
}
type DisjunctionQueryScorer struct { type DisjunctionQueryScorer struct {
options search.SearcherOptions options search.SearcherOptions
} }
// Size returns the scorer's static struct size (as measured by reflect
// at init time) plus size.SizeOfPtr.
func (s *DisjunctionQueryScorer) Size() int {
	total := reflectStaticSizeDisjunctionQueryScorer
	total += size.SizeOfPtr
	return total
}
func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer { func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer {
return &DisjunctionQueryScorer{ return &DisjunctionQueryScorer{
options: options, options: options,
@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
childrenExplanations = make([]*search.Explanation, len(constituents)) childrenExplanations = make([]*search.Explanation, len(constituents))
} }
var locations []search.FieldTermLocationMap
for i, docMatch := range constituents { for i, docMatch := range constituents {
sum += docMatch.Score sum += docMatch.Score
if s.options.Explain { if s.options.Explain {
childrenExplanations[i] = docMatch.Expl childrenExplanations[i] = docMatch.Expl
} }
if docMatch.Locations != nil {
locations = append(locations, docMatch.Locations)
}
} }
var rawExpl *search.Explanation var rawExpl *search.Explanation
@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
rv := constituents[0] rv := constituents[0]
rv.Score = newScore rv.Score = newScore
rv.Expl = newExpl rv.Expl = newExpl
if len(locations) == 1 { rv.FieldTermLocations = search.MergeFieldTermLocations(
rv.Locations = locations[0] rv.FieldTermLocations, constituents[1:])
} else if len(locations) > 1 {
rv.Locations = search.MergeLocations(locations)
}
return rv return rv
} }

@ -17,13 +17,22 @@ package scorer
import ( import (
"fmt" "fmt"
"math" "math"
"reflect"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeTermQueryScorer holds the value reported by reflect
// for the static size of a TermQueryScorer; it is computed once at
// package initialization.
var reflectStaticSizeTermQueryScorer int

func init() {
	scorerType := reflect.TypeOf(TermQueryScorer{})
	reflectStaticSizeTermQueryScorer = int(scorerType.Size())
}
type TermQueryScorer struct { type TermQueryScorer struct {
queryTerm []byte queryTerm string
queryField string queryField string
queryBoost float64 queryBoost float64
docTerm uint64 docTerm uint64
@ -36,9 +45,24 @@ type TermQueryScorer struct {
queryWeightExplanation *search.Explanation queryWeightExplanation *search.Explanation
} }
// Size returns the scorer's static struct size plus size.SizeOfPtr,
// the byte lengths of the query term and query field strings, and the
// sizes of the idf and query weight explanations when they are set.
func (s *TermQueryScorer) Size() int {
	total := reflectStaticSizeTermQueryScorer + size.SizeOfPtr
	total += len(s.queryTerm) + len(s.queryField)

	if expl := s.idfExplanation; expl != nil {
		total += expl.Size()
	}

	if expl := s.queryWeightExplanation; expl != nil {
		total += expl.Size()
	}

	return total
}
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer {
rv := TermQueryScorer{ rv := TermQueryScorer{
queryTerm: queryTerm, queryTerm: string(queryTerm),
queryField: queryField, queryField: queryField,
queryBoost: queryBoost, queryBoost: queryBoost,
docTerm: docTerm, docTerm: docTerm,
@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
} }
s.queryWeightExplanation = &search.Explanation{ s.queryWeightExplanation = &search.Explanation{
Value: s.queryWeight, Value: s.queryWeight,
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost), Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost),
Children: childrenExplanations, Children: childrenExplanations,
} }
} }
@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
childrenExplanations := make([]*search.Explanation, 3) childrenExplanations := make([]*search.Explanation, 3)
childrenExplanations[0] = &search.Explanation{ childrenExplanations[0] = &search.Explanation{
Value: tf, Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq), Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
} }
childrenExplanations[1] = &search.Explanation{ childrenExplanations[1] = &search.Explanation{
Value: termMatch.Norm, Value: termMatch.Norm,
@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
childrenExplanations[2] = s.idfExplanation childrenExplanations[2] = s.idfExplanation
scoreExplanation = &search.Explanation{ scoreExplanation = &search.Explanation{
Value: score, Value: score,
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID), Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
Children: childrenExplanations, Children: childrenExplanations,
} }
} }
@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
childExplanations[1] = scoreExplanation childExplanations[1] = scoreExplanation
scoreExplanation = &search.Explanation{ scoreExplanation = &search.Explanation{
Value: score, Value: score,
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID), Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
Children: childExplanations, Children: childExplanations,
} }
} }
@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
rv.Expl = scoreExplanation rv.Expl = scoreExplanation
} }
if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 { if len(termMatch.Vectors) > 0 {
locs := make([]search.Location, len(termMatch.Vectors)) if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {
locsUsed := 0 rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors))
totalPositions := 0
for _, v := range termMatch.Vectors {
totalPositions += len(v.ArrayPositions)
} }
positions := make(search.ArrayPositions, totalPositions)
positionsUsed := 0
rv.Locations = make(search.FieldTermLocationMap)
for _, v := range termMatch.Vectors { for _, v := range termMatch.Vectors {
tlm := rv.Locations[v.Field] var ap search.ArrayPositions
if tlm == nil {
tlm = make(search.TermLocationMap)
rv.Locations[v.Field] = tlm
}
loc := &locs[locsUsed]
locsUsed++
loc.Pos = v.Pos
loc.Start = v.Start
loc.End = v.End
if len(v.ArrayPositions) > 0 { if len(v.ArrayPositions) > 0 {
loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)] n := len(rv.FieldTermLocations)
for i, ap := range v.ArrayPositions { if n < cap(rv.FieldTermLocations) { // reuse ap slice if available
loc.ArrayPositions[i] = ap ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0]
} }
positionsUsed += len(v.ArrayPositions) ap = append(ap, v.ArrayPositions...)
} }
rv.FieldTermLocations =
tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc) append(rv.FieldTermLocations, search.FieldTermLocation{
Field: v.Field,
Term: s.queryTerm,
Location: search.Location{
Pos: v.Pos,
Start: v.Start,
End: v.End,
ArrayPositions: ap,
},
})
} }
} }

@ -16,11 +16,25 @@ package search
import ( import (
"fmt" "fmt"
"reflect"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
) )
// Static struct sizes, as reported by reflect, for the types whose
// Size methods use them. They are computed once at package
// initialization.
var (
	reflectStaticSizeDocumentMatch int
	reflectStaticSizeSearchContext int
	reflectStaticSizeLocation      int
)

func init() {
	reflectStaticSizeDocumentMatch = int(reflect.TypeOf(DocumentMatch{}).Size())
	reflectStaticSizeSearchContext = int(reflect.TypeOf(SearchContext{}).Size())
	reflectStaticSizeLocation = int(reflect.TypeOf(Location{}).Size())
}
type ArrayPositions []uint64 type ArrayPositions []uint64
func (ap ArrayPositions) Equals(other ArrayPositions) bool { func (ap ArrayPositions) Equals(other ArrayPositions) bool {
@ -47,6 +61,11 @@ type Location struct {
ArrayPositions ArrayPositions `json:"array_positions"` ArrayPositions ArrayPositions `json:"array_positions"`
} }
// Size returns the static struct size of a Location plus
// size.SizeOfPtr, plus size.SizeOfUint64 for every entry in
// ArrayPositions.
func (l *Location) Size() int {
	positionsBytes := len(l.ArrayPositions) * size.SizeOfUint64
	return reflectStaticSizeLocation + size.SizeOfPtr + positionsBytes
}
type Locations []*Location type Locations []*Location
type TermLocationMap map[string]Locations type TermLocationMap map[string]Locations
@ -57,6 +76,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) {
type FieldTermLocationMap map[string]TermLocationMap type FieldTermLocationMap map[string]TermLocationMap
// FieldTermLocation pairs a single Location with the Field and Term
// strings it belongs to. DocumentMatch.Complete files each entry into
// the Locations map under Locations[Field][Term].
type FieldTermLocation struct {
// Field is used as the outer key of the Locations map.
Field string
// Term is used as the inner (per-field) key of the Locations map.
Term string
// Location is the value copied into the Locations map for this entry.
Location Location
}
type FieldFragmentMap map[string][]string type FieldFragmentMap map[string][]string
type DocumentMatch struct { type DocumentMatch struct {
@ -74,11 +99,14 @@ type DocumentMatch struct {
// fields as float64s and date fields as time.RFC3339 formatted strings. // fields as float64s and date fields as time.RFC3339 formatted strings.
Fields map[string]interface{} `json:"fields,omitempty"` Fields map[string]interface{} `json:"fields,omitempty"`
// if we load the document for this hit, remember it so we dont load again
Document *document.Document `json:"-"`
// used to maintain natural index order // used to maintain natural index order
HitNumber uint64 `json:"-"` HitNumber uint64 `json:"-"`
// used to temporarily hold field term location information during
// search processing in an efficient, recycle-friendly manner, to
// be later incorporated into the Locations map when search
// results are completed
FieldTermLocations []FieldTermLocation `json:"-"`
} }
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
@ -108,15 +136,116 @@ func (dm *DocumentMatch) Reset() *DocumentMatch {
indexInternalID := dm.IndexInternalID indexInternalID := dm.IndexInternalID
// remember the []interface{} used for sort // remember the []interface{} used for sort
sort := dm.Sort sort := dm.Sort
// remember the FieldTermLocations backing array
ftls := dm.FieldTermLocations
for i := range ftls { // recycle the ArrayPositions of each location
ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0]
}
// idiom to copy over from empty DocumentMatch (0 allocations) // idiom to copy over from empty DocumentMatch (0 allocations)
*dm = DocumentMatch{} *dm = DocumentMatch{}
// reuse the []byte already allocated (and reset len to 0) // reuse the []byte already allocated (and reset len to 0)
dm.IndexInternalID = indexInternalID[:0] dm.IndexInternalID = indexInternalID[:0]
// reuse the []interface{} already allocated (and reset len to 0) // reuse the []interface{} already allocated (and reset len to 0)
dm.Sort = sort[:0] dm.Sort = sort[:0]
// reuse the FieldTermLocations already allocated (and reset len to 0)
dm.FieldTermLocations = ftls[:0]
return dm return dm
} }
// Size returns an accounting of the bytes held by this DocumentMatch:
// the static struct size plus size.SizeOfPtr, the lengths of Index, ID
// and IndexInternalID, the explanation (when set), and the keys and
// entries of Locations, Fragments, Sort and Fields. Each Fields entry
// is counted as its key plus size.SizeOfPtr; FieldTermLocations is not
// included in the total.
func (dm *DocumentMatch) Size() int {
	total := reflectStaticSizeDocumentMatch + size.SizeOfPtr
	total += len(dm.Index) + len(dm.ID) + len(dm.IndexInternalID)

	if dm.Expl != nil {
		total += dm.Expl.Size()
	}

	// field -> term -> locations
	for field, termLocs := range dm.Locations {
		total += size.SizeOfString + len(field)
		for term, locs := range termLocs {
			total += size.SizeOfString + len(term) + size.SizeOfSlice
			for _, loc := range locs {
				total += loc.Size()
			}
		}
	}

	// field -> fragments
	for field, frags := range dm.Fragments {
		total += size.SizeOfString + len(field) + size.SizeOfSlice
		for _, frag := range frags {
			total += size.SizeOfString + len(frag)
		}
	}

	for _, sortKey := range dm.Sort {
		total += size.SizeOfString + len(sortKey)
	}

	for name := range dm.Fields {
		total += size.SizeOfString + len(name) + size.SizeOfPtr
	}

	return total
}
// Complete performs final preparation & transformation of the
// DocumentMatch at the end of search processing, also allowing the
// caller to provide an optional preallocated locations slice.
//
// Every entry of dm.FieldTermLocations is copied into prealloc and a
// pointer to that copy is appended to dm.Locations[Field][Term]. When
// prealloc has insufficient capacity a new slice is allocated. The
// slice actually used is returned, resliced to the number of
// locations; dm.Locations keeps pointers into it.
//
// On return dm.FieldTermLocations has length zero, with its backing
// array and each entry's ArrayPositions backing array retained for
// reuse.
func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
	// transform the FieldTermLocations slice into the Locations map
	nlocs := len(dm.FieldTermLocations)
	if nlocs > 0 {
		if cap(prealloc) < nlocs {
			prealloc = make([]Location, nlocs)
		}
		prealloc = prealloc[:nlocs]

		var lastField string
		var tlm TermLocationMap

		for i, ftl := range dm.FieldTermLocations {
			// Always resolve the term map for the first entry: lastField
			// starts out as "", so an entry whose Field is the empty
			// string would otherwise skip this setup and the write to
			// tlm below would panic on a nil map.
			if i == 0 || lastField != ftl.Field {
				lastField = ftl.Field

				if dm.Locations == nil {
					dm.Locations = make(FieldTermLocationMap)
				}

				tlm = dm.Locations[ftl.Field]
				if tlm == nil {
					tlm = make(TermLocationMap)
					dm.Locations[ftl.Field] = tlm
				}
			}

			loc := &prealloc[i]
			*loc = ftl.Location

			// Give the published location its own ArrayPositions so the
			// original backing array can be recycled below.
			if len(loc.ArrayPositions) > 0 { // copy
				loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...)
			}

			tlm[ftl.Term] = append(tlm[ftl.Term], loc)

			dm.FieldTermLocations[i] = FieldTermLocation{ // recycle
				Location: Location{
					ArrayPositions: ftl.Location.ArrayPositions[:0],
				},
			}
		}
	}

	dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle

	return prealloc
}
func (dm *DocumentMatch) String() string { func (dm *DocumentMatch) String() string {
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)
} }
@ -135,6 +264,7 @@ type Searcher interface {
SetQueryNorm(float64) SetQueryNorm(float64)
Count() uint64 Count() uint64
Min() int Min() int
Size() int
DocumentMatchPoolSize() int DocumentMatchPoolSize() int
} }
@ -142,9 +272,26 @@ type Searcher interface {
type SearcherOptions struct { type SearcherOptions struct {
Explain bool Explain bool
IncludeTermVectors bool IncludeTermVectors bool
Score string
} }
// SearchContext represents the context around a single search // SearchContext represents the context around a single search
type SearchContext struct { type SearchContext struct {
DocumentMatchPool *DocumentMatchPool DocumentMatchPool *DocumentMatchPool
Collector Collector
}
func (sc *SearchContext) Size() int {
sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr +
reflectStaticSizeDocumentMatchPool + size.SizeOfPtr
if sc.DocumentMatchPool != nil {
for _, entry := range sc.DocumentMatchPool.avail {
if entry != nil {
sizeInBytes += entry.Size()
}
}
}
return sizeInBytes
} }

@ -16,12 +16,21 @@ package searcher
import ( import (
"math" "math"
"reflect"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer" "github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeBooleanSearcher holds the value reported by reflect
// for the static size of a BooleanSearcher; it is computed once at
// package initialization.
var reflectStaticSizeBooleanSearcher int

func init() {
	searcherType := reflect.TypeOf(BooleanSearcher{})
	reflectStaticSizeBooleanSearcher = int(searcherType.Size())
}
type BooleanSearcher struct { type BooleanSearcher struct {
indexReader index.IndexReader indexReader index.IndexReader
mustSearcher search.Searcher mustSearcher search.Searcher
@ -52,6 +61,32 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc
return &rv, nil return &rv, nil
} }
// Size returns the searcher's static struct size plus size.SizeOfPtr,
// the sizes of whichever of the must / should / must-not child
// searchers are set, the scorer's size, and the size of every non-nil
// entry in matches.
func (s *BooleanSearcher) Size() int {
	total := reflectStaticSizeBooleanSearcher + size.SizeOfPtr

	if s.mustSearcher != nil {
		total += s.mustSearcher.Size()
	}
	if s.shouldSearcher != nil {
		total += s.shouldSearcher.Size()
	}
	if s.mustNotSearcher != nil {
		total += s.mustNotSearcher.Size()
	}

	total += s.scorer.Size()

	for _, dm := range s.matches {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	return total
}
func (s *BooleanSearcher) computeQueryNorm() { func (s *BooleanSearcher) computeQueryNorm() {
// first calculate sum of squared weights // first calculate sum of squared weights
sumOfSquaredWeights := 0.0 sumOfSquaredWeights := 0.0
@ -284,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch
return nil, err return nil, err
} }
} }
return rv, nil return rv, nil
} }
@ -296,6 +332,14 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
} }
} }
// Advance the searchers only if the currentID cursor is trailing the lookup ID,
// additionally if the mustNotSearcher has been initialized, ensure that the
// cursor used to track the mustNotSearcher (currMustNot, which isn't tracked by
// currentID) is trailing the lookup ID as well - for in the case where currentID
// is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT
// advance the currentID or the currMustNot cursors.
if (s.currentID == nil || s.currentID.Compare(ID) < 0) &&
(s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) {
var err error var err error
if s.mustSearcher != nil { if s.mustSearcher != nil {
if s.currMust != nil { if s.currMust != nil {
@ -306,6 +350,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
return nil, err return nil, err
} }
} }
if s.shouldSearcher != nil { if s.shouldSearcher != nil {
if s.currShould != nil { if s.currShould != nil {
ctx.DocumentMatchPool.Put(s.currShould) ctx.DocumentMatchPool.Put(s.currShould)
@ -315,6 +360,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
return nil, err return nil, err
} }
} }
if s.mustNotSearcher != nil { if s.mustNotSearcher != nil {
if s.currMustNot != nil { if s.currMustNot != nil {
ctx.DocumentMatchPool.Put(s.currMustNot) ctx.DocumentMatchPool.Put(s.currMustNot)
@ -332,6 +378,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
} else { } else {
s.currentID = nil s.currentID = nil
} }
}
return s.Next(ctx) return s.Next(ctx)
} }

@ -16,13 +16,22 @@ package searcher
import ( import (
"math" "math"
"reflect"
"sort" "sort"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer" "github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
) )
// reflectStaticSizeConjunctionSearcher holds the value reported by
// reflect for the static size of a ConjunctionSearcher; it is computed
// once at package initialization.
var reflectStaticSizeConjunctionSearcher int

func init() {
	searcherType := reflect.TypeOf(ConjunctionSearcher{})
	reflectStaticSizeConjunctionSearcher = int(searcherType.Size())
}
type ConjunctionSearcher struct { type ConjunctionSearcher struct {
indexReader index.IndexReader indexReader index.IndexReader
searchers OrderedSearcherList searchers OrderedSearcherList
@ -34,14 +43,27 @@ type ConjunctionSearcher struct {
options search.SearcherOptions options search.SearcherOptions
} }
func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) { func NewConjunctionSearcher(indexReader index.IndexReader,
// build the downstream searchers qsearchers []search.Searcher, options search.SearcherOptions) (
search.Searcher, error) {
// build the sorted downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers)) searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers { for i, searcher := range qsearchers {
searchers[i] = searcher searchers[i] = searcher
} }
// sort the searchers
sort.Sort(searchers) sort.Sort(searchers)
// attempt the "unadorned" conjunction optimization only when we
// do not need extra information like freq-norm's or term vectors
if len(searchers) > 1 &&
options.Score == "none" && !options.IncludeTermVectors {
rv, err := optimizeCompositeSearcher("conjunction:unadorned",
indexReader, searchers, options)
if err != nil || rv != nil {
return rv, err
}
}
// build our searcher // build our searcher
rv := ConjunctionSearcher{ rv := ConjunctionSearcher{
indexReader: indexReader, indexReader: indexReader,
@ -51,9 +73,36 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
scorer: scorer.NewConjunctionQueryScorer(options), scorer: scorer.NewConjunctionQueryScorer(options),
} }
rv.computeQueryNorm() rv.computeQueryNorm()
// attempt push-down conjunction optimization when there's >1 searchers
if len(searchers) > 1 {
rv, err := optimizeCompositeSearcher("conjunction",
indexReader, searchers, options)
if err != nil || rv != nil {
return rv, err
}
}
return &rv, nil return &rv, nil
} }
// Size returns an estimate, in bytes, of the memory used by this searcher:
// its static struct size, a pointer, the scorer, every child searcher and
// every non-nil current document match.
func (s *ConjunctionSearcher) Size() int {
	total := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr
	total += s.scorer.Size()

	for i := range s.searchers {
		total += s.searchers[i].Size()
	}

	for _, dm := range s.currs {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	return total
}
func (s *ConjunctionSearcher) computeQueryNorm() { func (s *ConjunctionSearcher) computeQueryNorm() {
// first calculate sum of squared weights // first calculate sum of squared weights
sumOfSquaredWeights := 0.0 sumOfSquaredWeights := 0.0
@ -108,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM
var rv *search.DocumentMatch var rv *search.DocumentMatch
var err error var err error
OUTER: OUTER:
for s.currs[s.maxIDIdx] != nil { for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil {
maxID := s.currs[s.maxIDIdx].IndexInternalID maxID := s.currs[s.maxIDIdx].IndexInternalID
i := 0 i := 0

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc. // Copyright (c) 2018 Couchbase, Inc.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
@ -16,12 +16,9 @@ package searcher
import ( import (
"fmt" "fmt"
"math"
"sort"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
) )
// DisjunctionMaxClauseCount is a compile time setting that applications can // DisjunctionMaxClauseCount is a compile time setting that applications can
@ -29,246 +26,84 @@ import (
// error instead of executing searches when the size exceeds this value. // error instead of executing searches when the size exceeds this value.
var DisjunctionMaxClauseCount = 0 var DisjunctionMaxClauseCount = 0
type DisjunctionSearcher struct { // DisjunctionHeapTakeover is a compile time setting that applications can
indexReader index.IndexReader // adjust to control when the DisjunctionSearcher will switch from a simple
searchers OrderedSearcherList // slice implementation to a heap implementation.
numSearchers int var DisjunctionHeapTakeover = 10
queryNorm float64
currs []*search.DocumentMatch
scorer *scorer.DisjunctionQueryScorer
min int
matching []*search.DocumentMatch
matchingIdxs []int
initialized bool
}
func tooManyClauses(count int) bool {
if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount {
return true
}
return false
}
func tooManyClausesErr() error {
return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]",
DisjunctionMaxClauseCount)
}
func NewDisjunctionSearcher(indexReader index.IndexReader, func NewDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
*DisjunctionSearcher, error) { search.Searcher, error) {
return newDisjunctionSearcher(indexReader, qsearchers, min, options, return newDisjunctionSearcher(indexReader, qsearchers, min, options, true)
true)
} }
func newDisjunctionSearcher(indexReader index.IndexReader, func newDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions, qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) ( limit bool) (search.Searcher, error) {
*DisjunctionSearcher, error) { // attempt the "unadorned" disjunction optimization only when we
if limit && tooManyClauses(len(qsearchers)) { // do not need extra information like freq-norm's or term vectors
return nil, tooManyClausesErr() // and the requested min is simple
if len(qsearchers) > 1 && min <= 1 &&
options.Score == "none" && !options.IncludeTermVectors {
rv, err := optimizeCompositeSearcher("disjunction:unadorned",
indexReader, qsearchers, options)
if err != nil || rv != nil {
return rv, err
} }
// build the downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
searchers[i] = searcher
}
// sort the searchers
sort.Sort(sort.Reverse(searchers))
// build our searcher
rv := DisjunctionSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
currs: make([]*search.DocumentMatch, len(searchers)),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingIdxs: make([]int, len(searchers)),
}
rv.computeQueryNorm()
return &rv, nil
} }
func (s *DisjunctionSearcher) computeQueryNorm() { if len(qsearchers) > DisjunctionHeapTakeover {
// first calculate sum of squared weights return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options,
sumOfSquaredWeights := 0.0 limit)
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
var err error
// get all searchers pointing at their first match
for i, searcher := range s.searchers {
if s.currs[i] != nil {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return err
} }
return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options,
limit)
} }
err = s.updateMatches() func optimizeCompositeSearcher(optimizationKind string,
if err != nil { indexReader index.IndexReader, qsearchers []search.Searcher,
return err options search.SearcherOptions) (search.Searcher, error) {
} var octx index.OptimizableContext
s.initialized = true
return nil
}
func (s *DisjunctionSearcher) updateMatches() error {
matching := s.matching[:0]
matchingIdxs := s.matchingIdxs[:0]
for i := 0; i < len(s.currs); i++ {
curr := s.currs[i]
if curr == nil {
continue
}
if len(matching) > 0 {
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
if cmp > 0 {
continue
}
if cmp < 0 {
matching = matching[:0]
matchingIdxs = matchingIdxs[:0]
}
}
matching = append(matching, curr)
matchingIdxs = append(matchingIdxs, i)
}
s.matching = matching
s.matchingIdxs = matchingIdxs
return nil
}
func (s *DisjunctionSearcher) Weight() float64 {
var rv float64
for _, searcher := range s.searchers {
rv += searcher.Weight()
}
return rv
}
func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) { for _, searcher := range qsearchers {
for _, searcher := range s.searchers { o, ok := searcher.(index.Optimizable)
searcher.SetQueryNorm(qnorm) if !ok {
} return nil, nil
} }
func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) (
*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
var err error var err error
var rv *search.DocumentMatch octx, err = o.Optimize(optimizationKind, octx)
found := false
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
found = true
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
}
// invoke next on all the matching searchers
for _, i := range s.matchingIdxs {
searcher := s.searchers[i]
if s.currs[i] != rv {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
if err != nil { if err != nil {
return nil, err return nil, err
} }
}
err = s.updateMatches() if octx == nil {
if err != nil { return nil, nil
return nil, err
}
} }
return rv, nil
} }
func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext, optimized, err := octx.Finish()
ID index.IndexInternalID) (*search.DocumentMatch, error) { if err != nil || optimized == nil {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
// get all searchers pointing at their first match
var err error
for i, searcher := range s.searchers {
if s.currs[i] != nil {
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
continue
}
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
err = s.updateMatches()
if err != nil {
return nil, err return nil, err
} }
return s.Next(ctx) tfr, ok := optimized.(index.TermFieldReader)
if !ok {
return nil, nil
} }
func (s *DisjunctionSearcher) Count() uint64 { return newTermSearcherFromReader(indexReader, tfr,
// for now return a worst case []byte(optimizationKind), "*", 1.0, options)
var sum uint64
for _, searcher := range s.searchers {
sum += searcher.Count()
}
return sum
} }
func (s *DisjunctionSearcher) Close() (rv error) { func tooManyClauses(count int) bool {
for _, searcher := range s.searchers { if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount {
err := searcher.Close() return true
if err != nil && rv == nil {
rv = err
}
}
return rv
} }
return false
func (s *DisjunctionSearcher) Min() int {
return s.min
} }
func (s *DisjunctionSearcher) DocumentMatchPoolSize() int { func tooManyClausesErr(count int) error {
rv := len(s.currs) return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]",
for _, s := range s.searchers { count, DisjunctionMaxClauseCount)
rv += s.DocumentMatchPoolSize()
}
return rv
} }

@ -0,0 +1,343 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searcher
import (
"bytes"
"container/heap"
"math"
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
// Static (shallow) struct sizes, computed once at package init and used
// by DisjunctionHeapSearcher.Size().
var (
	reflectStaticSizeDisjunctionHeapSearcher int
	reflectStaticSizeSearcherCurr            int
)

// init measures the static sizes of the heap searcher and of a single
// heap entry via reflection.
func init() {
	var zeroSearcher DisjunctionHeapSearcher
	searcherType := reflect.TypeOf(zeroSearcher)
	reflectStaticSizeDisjunctionHeapSearcher = int(searcherType.Size())

	var zeroCurr SearcherCurr
	currType := reflect.TypeOf(zeroCurr)
	reflectStaticSizeSearcherCurr = int(currType.Size())
}
// SearcherCurr pairs a child searcher with the most recent document match
// obtained from it. Pointers to these pairs are the elements stored in the
// DisjunctionHeapSearcher's heap.
type SearcherCurr struct {
	searcher search.Searcher       // the child searcher
	curr     *search.DocumentMatch // the child's current (not yet consumed) match
}
// DisjunctionHeapSearcher is a disjunction searcher that keeps the current
// match of every child searcher in a heap ordered by IndexInternalID
// (see Less), so the smallest pending document is always at the top.
type DisjunctionHeapSearcher struct {
	indexReader index.IndexReader
	// numSearchers is the number of child searchers; it is handed to the
	// scorer when a match is scored.
	numSearchers int
	scorer       *scorer.DisjunctionQueryScorer
	// min is the number of children that must match a document for Next
	// to return it.
	min int
	// queryNorm is computed by computeQueryNorm and pushed to the children.
	queryNorm float64
	// initialized is set once initSearchers has completed successfully.
	initialized bool
	searchers   []search.Searcher
	// heap holds the pending child matches; it is manipulated through
	// container/heap using the Len/Less/Swap/Push/Pop methods.
	heap []*SearcherCurr
	// matching holds the document matches popped by updateMatches, all of
	// which share the same IndexInternalID.
	matching []*search.DocumentMatch
	// matchingCurrs holds the heap entries corresponding to matching; it
	// is also used as scratch space by Advance.
	matchingCurrs []*SearcherCurr
}
// newDisjunctionHeapSearcher builds a heap-based disjunction over the given
// child searchers. When limit is true and the number of children exceeds
// DisjunctionMaxClauseCount, a TooManyClauses error is returned instead.
// min is truncated to an int and is the number of children that must match
// a document for it to be returned.
func newDisjunctionHeapSearcher(indexReader index.IndexReader,
	searchers []search.Searcher, min float64, options search.SearcherOptions,
	limit bool) (
	*DisjunctionHeapSearcher, error) {
	count := len(searchers)
	if limit && tooManyClauses(count) {
		return nil, tooManyClausesErr(count)
	}

	// matching/matchingCurrs are sized up front so they can be re-sliced
	// without reallocating; the heap starts empty with room for every child.
	dhs := &DisjunctionHeapSearcher{
		indexReader:   indexReader,
		searchers:     searchers,
		numSearchers:  count,
		scorer:        scorer.NewDisjunctionQueryScorer(options),
		min:           int(min),
		matching:      make([]*search.DocumentMatch, count),
		matchingCurrs: make([]*SearcherCurr, count),
		heap:          make([]*SearcherCurr, 0, count),
	}
	dhs.computeQueryNorm()
	return dhs, nil
}
// Size returns an estimate, in bytes, of the memory used by this searcher.
func (s *DisjunctionHeapSearcher) Size() int {
	total := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr
	total += s.scorer.Size()

	for i := range s.searchers {
		total += s.searchers[i].Size()
	}

	for _, dm := range s.matching {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	// matchingCurrs and heap entries only contribute their static size:
	// the searchers and document matches they point at were counted above.
	total += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr
	total += len(s.heap) * reflectStaticSizeSearcherCurr

	return total
}
// computeQueryNorm derives the query norm as 1/sqrt(sum of child weights),
// stores it on the searcher and propagates it to every child.
// NOTE(review): each child's Weight() is presumably already a squared
// weight, as the original comment implied — confirm against the scorers.
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
	var weightSum float64
	for i := range s.searchers {
		weightSum += s.searchers[i].Weight()
	}

	s.queryNorm = 1.0 / math.Sqrt(weightSum)

	for i := range s.searchers {
		s.searchers[i].SetQueryNorm(s.queryNorm)
	}
}
// initSearchers positions every child searcher on its first match, pushes
// the children that produced one onto the heap, and computes the initial
// matching set.
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
	// one allocation backs all heap entries
	entries := make([]SearcherCurr, len(s.searchers))

	for idx := range s.searchers {
		child := s.searchers[idx]
		first, err := child.Next(ctx)
		if err != nil {
			return err
		}
		if first == nil {
			// this child has no matches at all; it never enters the heap
			continue
		}
		entry := &entries[idx]
		entry.searcher = child
		entry.curr = first
		heap.Push(s, entry)
	}

	if err := s.updateMatches(); err != nil {
		return err
	}
	s.initialized = true
	return nil
}
// updateMatches pops from the heap every entry whose current match shares
// the smallest pending IndexInternalID and records them in s.matching and
// s.matchingCurrs (reusing their backing arrays). It always returns nil.
func (s *DisjunctionHeapSearcher) updateMatches() error {
	hits := s.matching[:0]
	hitCurrs := s.matchingCurrs[:0]

	for len(s.heap) > 0 {
		// once a first hit exists, stop as soon as the top of the heap
		// refers to a different document
		if len(hitCurrs) > 0 &&
			bytes.Compare(hitCurrs[0].curr.IndexInternalID, s.heap[0].curr.IndexInternalID) != 0 {
			break
		}
		top := heap.Pop(s).(*SearcherCurr)
		hits = append(hits, top.curr)
		hitCurrs = append(hitCurrs, top)
	}

	s.matching = hits
	s.matchingCurrs = hitCurrs
	return nil
}
// Weight returns the sum of the weights of all child searchers.
func (s *DisjunctionHeapSearcher) Weight() float64 {
	total := 0.0
	for i := range s.searchers {
		total += s.searchers[i].Weight()
	}
	return total
}
// SetQueryNorm forwards the given query norm to every child searcher.
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) {
	for i := range s.searchers {
		s.searchers[i].SetQueryNorm(qnorm)
	}
}
// Next returns the next document matched by at least s.min child
// searchers, or nil once no pending child matches remain. The child
// searchers are initialized lazily on the first call.
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
	*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	var rv *search.DocumentMatch
	found := false
	for !found && len(s.matching) > 0 {
		if len(s.matching) >= s.min {
			found = true
			// score this match
			rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
		}
		// invoke next on all the matching searchers
		for _, matchingCurr := range s.matchingCurrs {
			// recycle the consumed match unless it is the one being returned
			// NOTE(review): presumably the scorer may hand back one of the
			// constituent matches as rv — confirm against the scorer.
			if matchingCurr.curr != rv {
				ctx.DocumentMatchPool.Put(matchingCurr.curr)
			}
			curr, err := matchingCurr.searcher.Next(ctx)
			if err != nil {
				return nil, err
			}
			// exhausted children are simply not pushed back onto the heap
			if curr != nil {
				matchingCurr.curr = curr
				heap.Push(s, matchingCurr)
			}
		}
		// recompute the matching set for the next smallest document
		err := s.updateMatches()
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}
// Advance moves every child searcher whose current match is before ID
// forward to ID (or beyond) and then returns the result of Next. The
// child searchers are initialized lazily if this is the first call.
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext,
	ID index.IndexInternalID) (*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	// if there is anything in matching, toss it back onto the heap
	for _, matchingCurr := range s.matchingCurrs {
		heap.Push(s, matchingCurr)
	}
	s.matching = s.matching[:0]
	s.matchingCurrs = s.matchingCurrs[:0]
	// find all searchers that actually need to be advanced
	// advance them, using s.matchingCurrs as temp storage
	for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 {
		searcherCurr := heap.Pop(s).(*SearcherCurr)
		// the skipped match is no longer needed; hand it back to the pool
		ctx.DocumentMatchPool.Put(searcherCurr.curr)
		curr, err := searcherCurr.searcher.Advance(ctx, ID)
		if err != nil {
			return nil, err
		}
		// children with nothing at or after ID drop out of the heap
		if curr != nil {
			searcherCurr.curr = curr
			s.matchingCurrs = append(s.matchingCurrs, searcherCurr)
		}
	}
	// now all of the searchers that we advanced have to be pushed back
	for _, matchingCurr := range s.matchingCurrs {
		heap.Push(s, matchingCurr)
	}
	// reset our temp space
	s.matchingCurrs = s.matchingCurrs[:0]
	err := s.updateMatches()
	if err != nil {
		return nil, err
	}
	return s.Next(ctx)
}
// Count returns an upper bound on the number of matches: the sum of the
// counts of all child searchers (documents matched by several children
// are counted more than once).
func (s *DisjunctionHeapSearcher) Count() uint64 {
	var total uint64
	for i := range s.searchers {
		total += s.searchers[i].Count()
	}
	return total
}
// Close closes every child searcher and returns the first error
// encountered, if any. Later children are still closed after a failure.
func (s *DisjunctionHeapSearcher) Close() (rv error) {
	for i := range s.searchers {
		cerr := s.searchers[i].Close()
		if rv == nil && cerr != nil {
			rv = cerr
		}
	}
	return rv
}
// Min returns the number of child searchers that must match a document
// for the disjunction to return it.
func (s *DisjunctionHeapSearcher) Min() int {
	return s.min
}
// DocumentMatchPoolSize returns the number of document matches this
// searcher may hold at once: one per child, plus whatever each child
// reports for itself.
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int {
	total := len(s.searchers)
	for i := range s.searchers {
		total += s.searchers[i].DocumentMatchPoolSize()
	}
	return total
}
// Optimize implements the index.Optimizable interface, but only
// participates in the edge case where the disjunction is a wrapper
// around a single Optimizable child searcher; the call is then delegated
// to that child.
//
// In every other case nil is returned rather than the incoming octx.
// optimizeCompositeSearcher treats a nil context as "cannot optimize" and
// falls back to the regular searcher, whereas handing octx back untouched
// would let the optimized searcher be finished without this disjunction's
// children ever having been registered with it.
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) (
	index.OptimizableContext, error) {
	if len(s.searchers) == 1 {
		if o, ok := s.searchers[0].(index.Optimizable); ok {
			return o.Optimize(kind, octx)
		}
	}
	return nil, nil
}
// heap impl: the methods below implement container/heap's heap.Interface
// over s.heap.

// Len returns the number of entries currently on the heap.
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) }
// Less orders heap entries by the IndexInternalID of their current match
// (byte-wise comparison). An entry without a current match sorts before
// any other entry.
func (s *DisjunctionHeapSearcher) Less(i, j int) bool {
	left := s.heap[i].curr
	if left == nil {
		return true
	}
	right := s.heap[j].curr
	if right == nil {
		return false
	}
	return bytes.Compare(left.IndexInternalID, right.IndexInternalID) < 0
}
// Swap exchanges the heap entries at positions i and j.
func (s *DisjunctionHeapSearcher) Swap(i, j int) {
	entries := s.heap
	entries[i], entries[j] = entries[j], entries[i]
}
// Push appends x, which must be a *SearcherCurr, to the end of the heap
// slice. It is meant to be called through heap.Push, not directly.
func (s *DisjunctionHeapSearcher) Push(x interface{}) {
	entry := x.(*SearcherCurr)
	s.heap = append(s.heap, entry)
}
// Pop removes and returns the last element of the heap slice. It is meant
// to be called through heap.Pop, which first moves the minimum there.
func (s *DisjunctionHeapSearcher) Pop() interface{} {
	last := len(s.heap) - 1
	popped := s.heap[last]
	s.heap = s.heap[:last]
	return popped
}

@ -0,0 +1,298 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searcher
import (
"math"
"reflect"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
// reflectStaticSizeDisjunctionSliceSearcher is the static (shallow) size of
// a DisjunctionSliceSearcher value, used by its Size() method.
var reflectStaticSizeDisjunctionSliceSearcher int

// init measures the static struct size once via reflection.
func init() {
	var zero DisjunctionSliceSearcher
	typ := reflect.TypeOf(zero)
	reflectStaticSizeDisjunctionSliceSearcher = int(typ.Size())
}
// DisjunctionSliceSearcher is a disjunction searcher that keeps the
// current match of every child searcher in a slice parallel to searchers
// and scans that slice to find the smallest pending document.
type DisjunctionSliceSearcher struct {
	indexReader index.IndexReader
	// searchers holds the child searchers, sorted in reverse
	// OrderedSearcherList order by the constructor.
	searchers OrderedSearcherList
	// numSearchers is the number of child searchers; it is handed to the
	// scorer when a match is scored.
	numSearchers int
	// queryNorm is computed by computeQueryNorm and pushed to the children.
	queryNorm float64
	// currs[i] is the current match of searchers[i], or nil.
	currs  []*search.DocumentMatch
	scorer *scorer.DisjunctionQueryScorer
	// min is the number of children that must match a document for Next
	// to return it.
	min int
	// matching holds the current matches sharing the smallest
	// IndexInternalID, as computed by updateMatches.
	matching []*search.DocumentMatch
	// matchingIdxs holds the indexes into searchers/currs of the entries
	// in matching.
	matchingIdxs []int
	// initialized is set once initSearchers has completed successfully.
	initialized bool
}
// newDisjunctionSliceSearcher builds a slice-based disjunction over the
// given child searchers. When limit is true and the number of children
// exceeds DisjunctionMaxClauseCount, a TooManyClauses error is returned
// instead. min is truncated to an int and is the number of children that
// must match a document for it to be returned.
func newDisjunctionSliceSearcher(indexReader index.IndexReader,
	qsearchers []search.Searcher, min float64, options search.SearcherOptions,
	limit bool) (
	*DisjunctionSliceSearcher, error) {
	count := len(qsearchers)
	if limit && tooManyClauses(count) {
		return nil, tooManyClausesErr(count)
	}

	// copy the children into our own list so that sorting does not
	// disturb the caller's slice
	ordered := make(OrderedSearcherList, 0, count)
	for _, child := range qsearchers {
		ordered = append(ordered, child)
	}
	sort.Sort(sort.Reverse(ordered))

	dss := &DisjunctionSliceSearcher{
		indexReader:  indexReader,
		searchers:    ordered,
		numSearchers: len(ordered),
		currs:        make([]*search.DocumentMatch, len(ordered)),
		scorer:       scorer.NewDisjunctionQueryScorer(options),
		min:          int(min),
		matching:     make([]*search.DocumentMatch, len(ordered)),
		matchingIdxs: make([]int, len(ordered)),
	}
	dss.computeQueryNorm()
	return dss, nil
}
// Size returns an estimate, in bytes, of the memory used by this searcher.
func (s *DisjunctionSliceSearcher) Size() int {
	total := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr
	total += s.scorer.Size()

	for i := range s.searchers {
		total += s.searchers[i].Size()
	}

	for _, dm := range s.currs {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	for _, dm := range s.matching {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	total += len(s.matchingIdxs) * size.SizeOfInt

	return total
}
// computeQueryNorm derives the query norm as 1/sqrt(sum of child weights),
// stores it on the searcher and propagates it to every child.
// NOTE(review): each child's Weight() is presumably already a squared
// weight, as the original comment implied — confirm against the scorers.
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
	var weightSum float64
	for i := range s.searchers {
		weightSum += s.searchers[i].Weight()
	}

	s.queryNorm = 1.0 / math.Sqrt(weightSum)

	for i := range s.searchers {
		s.searchers[i].SetQueryNorm(s.queryNorm)
	}
}
// initSearchers positions every child searcher on its first match
// (recycling any match already held in that slot) and computes the
// initial matching set.
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
	for idx := range s.searchers {
		if stale := s.currs[idx]; stale != nil {
			ctx.DocumentMatchPool.Put(stale)
		}
		first, err := s.searchers[idx].Next(ctx)
		// the slot is overwritten even when the child reports an error
		s.currs[idx] = first
		if err != nil {
			return err
		}
	}

	if err := s.updateMatches(); err != nil {
		return err
	}

	s.initialized = true
	return nil
}
// updateMatches scans the current match of every child and collects, in
// s.matching and s.matchingIdxs (reusing their backing arrays), those that
// share the smallest IndexInternalID. It always returns nil.
func (s *DisjunctionSliceSearcher) updateMatches() error {
	hits := s.matching[:0]
	hitIdxs := s.matchingIdxs[:0]

	for idx, candidate := range s.currs {
		if candidate == nil {
			continue
		}
		if len(hits) > 0 {
			switch cmp := candidate.IndexInternalID.Compare(hits[0].IndexInternalID); {
			case cmp > 0:
				// later than the best seen so far; not part of this round
				continue
			case cmp < 0:
				// strictly earlier document found; start the set over
				hits = hits[:0]
				hitIdxs = hitIdxs[:0]
			}
		}
		hits = append(hits, candidate)
		hitIdxs = append(hitIdxs, idx)
	}

	s.matching = hits
	s.matchingIdxs = hitIdxs
	return nil
}
// Weight returns the sum of the weights of all child searchers.
func (s *DisjunctionSliceSearcher) Weight() float64 {
	total := 0.0
	for i := range s.searchers {
		total += s.searchers[i].Weight()
	}
	return total
}
// SetQueryNorm forwards the given query norm to every child searcher.
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) {
	for i := range s.searchers {
		s.searchers[i].SetQueryNorm(qnorm)
	}
}
// Next returns the next document matched by at least s.min child
// searchers, or nil once no child has a pending match. The child
// searchers are initialized lazily on the first call.
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
	*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	var err error
	var rv *search.DocumentMatch
	found := false
	for !found && len(s.matching) > 0 {
		if len(s.matching) >= s.min {
			found = true
			// score this match
			rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
		}
		// invoke next on all the matching searchers
		for _, i := range s.matchingIdxs {
			searcher := s.searchers[i]
			// recycle the consumed match unless it is the one being returned
			// NOTE(review): presumably the scorer may hand back one of the
			// constituent matches as rv — confirm against the scorer.
			if s.currs[i] != rv {
				ctx.DocumentMatchPool.Put(s.currs[i])
			}
			s.currs[i], err = searcher.Next(ctx)
			if err != nil {
				return nil, err
			}
		}
		// recompute the matching set for the next smallest document
		err = s.updateMatches()
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}
// Advance moves every child searcher whose current match is missing or
// before ID forward to ID (or beyond) and then returns the result of
// Next. The child searchers are initialized lazily if this is the first
// call.
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
	ID index.IndexInternalID) (*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	// advance every searcher that is not already at or beyond ID
	var err error
	for i, searcher := range s.searchers {
		if s.currs[i] != nil {
			// already positioned at or after the target; leave it alone
			if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
				continue
			}
			// the skipped match is no longer needed; hand it back to the pool
			ctx.DocumentMatchPool.Put(s.currs[i])
		}
		s.currs[i], err = searcher.Advance(ctx, ID)
		if err != nil {
			return nil, err
		}
	}
	err = s.updateMatches()
	if err != nil {
		return nil, err
	}
	return s.Next(ctx)
}
// Count returns an upper bound on the number of matches: the sum of the
// counts of all child searchers (documents matched by several children
// are counted more than once).
func (s *DisjunctionSliceSearcher) Count() uint64 {
	var total uint64
	for i := range s.searchers {
		total += s.searchers[i].Count()
	}
	return total
}
// Close closes every child searcher and returns the first error
// encountered, if any. Later children are still closed after a failure.
func (s *DisjunctionSliceSearcher) Close() (rv error) {
	for i := range s.searchers {
		cerr := s.searchers[i].Close()
		if rv == nil && cerr != nil {
			rv = cerr
		}
	}
	return rv
}
// Min returns the number of child searchers that must match a document
// for the disjunction to return it.
func (s *DisjunctionSliceSearcher) Min() int {
	return s.min
}
// DocumentMatchPoolSize returns the number of document matches this
// searcher may hold at once: one per slot in currs, plus whatever each
// child reports for itself.
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int {
	total := len(s.currs)
	for i := range s.searchers {
		total += s.searchers[i].DocumentMatchPoolSize()
	}
	return total
}
// Optimize implements the index.Optimizable interface, but only
// participates in the edge case where the disjunction is a wrapper
// around a single Optimizable child searcher; the call is then delegated
// to that child.
//
// In every other case nil is returned rather than the incoming octx.
// optimizeCompositeSearcher treats a nil context as "cannot optimize" and
// falls back to the regular searcher, whereas handing octx back untouched
// would let the optimized searcher be finished without this disjunction's
// children ever having been registered with it.
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) (
	index.OptimizableContext, error) {
	if len(s.searchers) == 1 {
		if o, ok := s.searchers[0].(index.Optimizable); ok {
			return o.Optimize(kind, octx)
		}
	}
	return nil, nil
}

@ -15,11 +15,21 @@
package searcher package searcher
import ( import (
"reflect"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer" "github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeDocIDSearcher int
// init records the static (shallow) size of a DocIDSearcher value once,
// so that Size() does not have to use reflection on every call.
func init() {
	var zero DocIDSearcher
	typ := reflect.TypeOf(zero)
	reflectStaticSizeDocIDSearcher = int(typ.Size())
}
// DocIDSearcher returns documents matching a predefined set of identifiers. // DocIDSearcher returns documents matching a predefined set of identifiers.
type DocIDSearcher struct { type DocIDSearcher struct {
reader index.DocIDReader reader index.DocIDReader
@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64
}, nil }, nil
} }
// Size returns an estimate, in bytes, of the memory used by this searcher:
// its static struct size, a pointer, the doc ID reader and the scorer.
func (s *DocIDSearcher) Size() int {
	total := reflectStaticSizeDocIDSearcher + size.SizeOfPtr
	total += s.reader.Size()
	total += s.scorer.Size()
	return total
}
func (s *DocIDSearcher) Count() uint64 { func (s *DocIDSearcher) Count() uint64 {
return uint64(s.count) return uint64(s.count)
} }

@ -15,10 +15,20 @@
package searcher package searcher
import ( import (
"reflect"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
) )
var reflectStaticSizeFilteringSearcher int
// init records the static (shallow) size of a FilteringSearcher value
// once, so that Size() does not have to use reflection on every call.
func init() {
	var zero FilteringSearcher
	typ := reflect.TypeOf(zero)
	reflectStaticSizeFilteringSearcher = int(typ.Size())
}
// FilterFunc defines a function which can filter documents // FilterFunc defines a function which can filter documents
// returning true means keep the document // returning true means keep the document
// returning false means do not keep the document // returning false means do not keep the document
@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch
} }
} }
// Size returns an estimate, in bytes, of the memory used by this searcher:
// its static struct size, a pointer and the wrapped child searcher.
func (f *FilteringSearcher) Size() int {
	total := reflectStaticSizeFilteringSearcher + size.SizeOfPtr
	total += f.child.Size()
	return total
}
func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
next, err := f.child.Next(ctx) next, err := f.child.Next(ctx)
for next != nil && err == nil { for next != nil && err == nil {

@ -15,13 +15,26 @@
package searcher package searcher
import ( import (
"fmt"
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search"
) )
// MaxFuzziness is the largest fuzziness value NewFuzzySearcher accepts;
// requests with a greater fuzziness are rejected with an error.
var MaxFuzziness = 2
func NewFuzzySearcher(indexReader index.IndexReader, term string, func NewFuzzySearcher(indexReader index.IndexReader, term string,
prefix, fuzziness int, field string, boost float64, prefix, fuzziness int, field string, boost float64,
options search.SearcherOptions) (search.Searcher, error) { options search.SearcherOptions) (search.Searcher, error) {
if fuzziness > MaxFuzziness {
return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
}
if fuzziness < 0 {
return nil, fmt.Errorf("invalid fuzziness, negative")
}
// Note: we don't byte slice the term for a prefix because of runes. // Note: we don't byte slice the term for a prefix because of runes.
prefixTerm := "" prefixTerm := ""
for i, r := range term { for i, r := range term {
@ -31,7 +44,6 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
break break
} }
} }
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
field, prefixTerm) field, prefixTerm)
if err != nil { if err != nil {
@ -45,12 +57,40 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
fuzziness int, field, prefixTerm string) (rv []string, err error) { fuzziness int, field, prefixTerm string) (rv []string, err error) {
rv = make([]string, 0) rv = make([]string, 0)
// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm)
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) {
return nil, tooManyClausesErr(len(rv))
}
tfd, err = fieldDict.Next()
}
return rv, err
}
var fieldDict index.FieldDict var fieldDict index.FieldDict
if len(prefixTerm) > 0 { if len(prefixTerm) > 0 {
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
} else { } else {
fieldDict, err = indexReader.FieldDict(field) fieldDict, err = indexReader.FieldDict(field)
} }
if err != nil {
return nil, err
}
defer func() { defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil { if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr err = cerr
@ -58,13 +98,16 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
}() }()
// enumerate terms and check levenshtein distance // enumerate terms and check levenshtein distance
var reuse []int
tfd, err := fieldDict.Next() tfd, err := fieldDict.Next()
for err == nil && tfd != nil { for err == nil && tfd != nil {
ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness) var ld int
var exceeded bool
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
if !exceeded && ld <= fuzziness { if !exceeded && ld <= fuzziness {
rv = append(rv, tfd.Term) rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) { if tooManyClauses(len(rv)) {
return rv, tooManyClausesErr() return nil, tooManyClausesErr(len(rv))
} }
} }
tfd, err = fieldDict.Next() tfd, err = fieldDict.Next()

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save