Reduce repo indexer disk usage (#3452)
parent
283e87d814
commit
a89592d4ab
@ -0,0 +1,53 @@ |
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package unique |
||||
|
||||
import ( |
||||
"github.com/blevesearch/bleve/analysis" |
||||
"github.com/blevesearch/bleve/registry" |
||||
) |
||||
|
||||
// Name is the identifier under which this token filter is registered.
const Name = "unique"
||||
|
||||
// UniqueTermFilter is a token filter that keeps only the first token seen for
// each distinct term. Any later token whose term has already occurred is
// dropped from the stream.
type UniqueTermFilter struct{}
||||
|
||||
func NewUniqueTermFilter() *UniqueTermFilter { |
||||
return &UniqueTermFilter{} |
||||
} |
||||
|
||||
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream { |
||||
encounteredTerms := make(map[string]struct{}, len(input)/4) |
||||
j := 0 |
||||
for _, token := range input { |
||||
term := string(token.Term) |
||||
if _, ok := encounteredTerms[term]; ok { |
||||
continue |
||||
} |
||||
encounteredTerms[term] = struct{}{} |
||||
input[j] = token |
||||
j++ |
||||
} |
||||
return input[:j] |
||||
} |
||||
|
||||
func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { |
||||
return NewUniqueTermFilter(), nil |
||||
} |
||||
|
||||
func init() { |
||||
registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor) |
||||
} |
@ -0,0 +1,173 @@ |
||||
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. |
||||
|
||||
|
||||
[[projects]] |
||||
name = "github.com/RoaringBitmap/roaring" |
||||
packages = ["."] |
||||
revision = "84551f0e309d6f9bafa428ef39b31ab7f16ff7b8" |
||||
version = "v0.4.1" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/Smerity/govarint" |
||||
packages = ["."] |
||||
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab" |
||||
|
||||
[[projects]] |
||||
name = "github.com/blevesearch/bleve" |
||||
packages = [ |
||||
".", |
||||
"analysis", |
||||
"analysis/analyzer/standard", |
||||
"analysis/datetime/flexible", |
||||
"analysis/datetime/optional", |
||||
"analysis/lang/en", |
||||
"analysis/token/lowercase", |
||||
"analysis/token/porter", |
||||
"analysis/token/stop", |
||||
"analysis/tokenizer/unicode", |
||||
"document", |
||||
"geo", |
||||
"index", |
||||
"index/scorch", |
||||
"index/scorch/mergeplan", |
||||
"index/scorch/segment", |
||||
"index/scorch/segment/mem", |
||||
"index/scorch/segment/zap", |
||||
"index/store", |
||||
"index/store/boltdb", |
||||
"index/store/gtreap", |
||||
"index/upsidedown", |
||||
"mapping", |
||||
"numeric", |
||||
"registry", |
||||
"search", |
||||
"search/collector", |
||||
"search/facet", |
||||
"search/highlight", |
||||
"search/highlight/format/html", |
||||
"search/highlight/fragmenter/simple", |
||||
"search/highlight/highlighter/html", |
||||
"search/highlight/highlighter/simple", |
||||
"search/query", |
||||
"search/scorer", |
||||
"search/searcher" |
||||
] |
||||
revision = "a3b125508b4443344b596888ca58467b6c9310b9" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/blevesearch/go-porterstemmer" |
||||
packages = ["."] |
||||
revision = "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/blevesearch/segment" |
||||
packages = ["."] |
||||
revision = "762005e7a34fd909a84586299f1dd457371d36ee" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/boltdb/bolt" |
||||
packages = ["."] |
||||
revision = "9da31745363232bc1e27dbab3569e77383a51585" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/couchbase/vellum" |
||||
packages = [ |
||||
".", |
||||
"regexp", |
||||
"utf8" |
||||
] |
||||
revision = "ed84a675e24ed0a0bf6859b1ddec7e7c858354bd" |
||||
|
||||
[[projects]] |
||||
name = "github.com/davecgh/go-spew" |
||||
packages = ["spew"] |
||||
revision = "346938d642f2ec3594ed81d874461961cd0faa76" |
||||
version = "v1.1.0" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/edsrzf/mmap-go" |
||||
packages = ["."] |
||||
revision = "0bce6a6887123b67a60366d2c9fe2dfb74289d2e" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/glycerine/go-unsnap-stream" |
||||
packages = ["."] |
||||
revision = "62a9a9eb44fd8932157b1a8ace2149eff5971af6" |
||||
|
||||
[[projects]] |
||||
name = "github.com/golang/protobuf" |
||||
packages = ["proto"] |
||||
revision = "925541529c1fa6821df4e44ce2723319eb2be768" |
||||
version = "v1.0.0" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/golang/snappy" |
||||
packages = ["."] |
||||
revision = "553a641470496b2327abcac10b36396bd98e45c9" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/mschoch/smat" |
||||
packages = ["."] |
||||
revision = "90eadee771aeab36e8bf796039b8c261bebebe4f" |
||||
|
||||
[[projects]] |
||||
name = "github.com/philhofer/fwd" |
||||
packages = ["."] |
||||
revision = "bb6d471dc95d4fe11e432687f8b70ff496cf3136" |
||||
version = "v1.0.0" |
||||
|
||||
[[projects]] |
||||
name = "github.com/pmezard/go-difflib" |
||||
packages = ["difflib"] |
||||
revision = "792786c7400a136282c1664665ae0a8db921c6c2" |
||||
version = "v1.0.0" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/steveyen/gtreap" |
||||
packages = ["."] |
||||
revision = "0abe01ef9be25c4aedc174758ec2d917314d6d70" |
||||
|
||||
[[projects]] |
||||
name = "github.com/stretchr/testify" |
||||
packages = ["assert"] |
||||
revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71" |
||||
version = "v1.2.1" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/tinylib/msgp" |
||||
packages = ["msgp"] |
||||
revision = "03a79185462ad029a6e7e05b2f3f3e0498d0a6c0" |
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/willf/bitset" |
||||
packages = ["."] |
||||
revision = "1a37ad96e8c1a11b20900a232874843b5174221f" |
||||
|
||||
[[projects]] |
||||
name = "golang.org/x/net" |
||||
packages = ["context"] |
||||
revision = "309822c5b9b9f80db67f016069a12628d94fad34" |
||||
|
||||
[[projects]] |
||||
name = "golang.org/x/sys" |
||||
packages = ["unix"] |
||||
revision = "3dbebcf8efb6a5011a60c2b4591c1022a759af8a" |
||||
|
||||
[solve-meta] |
||||
analyzer-name = "dep" |
||||
analyzer-version = 1 |
||||
inputs-digest = "61c759f0c1136cadf86ae8a30bb78edf33fc844cdcb2316469b4ae14a8d051b0" |
||||
solver-name = "gps-cdcl" |
||||
solver-version = 1 |
@ -0,0 +1,34 @@ |
||||
# Gopkg.toml example |
||||
# |
||||
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md |
||||
# for detailed Gopkg.toml documentation. |
||||
# |
||||
# required = ["github.com/user/thing/cmd/thing"] |
||||
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] |
||||
# |
||||
# [[constraint]] |
||||
# name = "github.com/user/project" |
||||
# version = "1.0.0" |
||||
# |
||||
# [[constraint]] |
||||
# name = "github.com/user/project2" |
||||
# branch = "dev" |
||||
# source = "github.com/myfork/project2" |
||||
# |
||||
# [[override]] |
||||
# name = "github.com/x/y" |
||||
# version = "2.4.0" |
||||
# |
||||
# [prune] |
||||
# non-go = false |
||||
# go-tests = true |
||||
# unused-packages = true |
||||
|
||||
|
||||
[[constraint]] |
||||
name = "github.com/stretchr/testify" |
||||
version = "1.2.1" |
||||
|
||||
[prune] |
||||
go-tests = true |
||||
unused-packages = true |
@ -0,0 +1,21 @@ |
||||
MIT License |
||||
|
||||
Copyright (c) 2018 Ethan Koenig |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
@ -0,0 +1,13 @@ |
||||
# rupture |
||||
|
||||
[![Build Status](https://travis-ci.org/ethantkoenig/rupture.svg?branch=master)](https://travis-ci.org/ethantkoenig/rupture) [![GoDoc](https://godoc.org/github.com/ethantkoenig/rupture?status.svg)](https://godoc.org/github.com/ethantkoenig/rupture) [![Go Report Card](https://goreportcard.com/badge/github.com/ethantkoenig/rupture)](https://goreportcard.com/report/github.com/ethantkoenig/rupture) |
||||
|
||||
An explosive companion to the [bleve indexing library](https://www.github.com/blevesearch/bleve) |
||||
|
||||
## Features |
||||
|
||||
`rupture` includes the following additions to `bleve`: |
||||
|
||||
- __Flushing batches__: Batches of operations that automatically flush to the underlying bleve index. |
||||
- __Sharded indices__: An index-like abstraction built on top of several underlying indices. Sharded indices provide lower write latencies for indices with large amounts of data. |
||||
- __Index metadata__: Track index version for easily managing migrations and schema changes. |
@ -0,0 +1,67 @@ |
||||
package rupture |
||||
|
||||
import ( |
||||
"github.com/blevesearch/bleve" |
||||
) |
||||
|
||||
// FlushingBatch is a batch of index operations that flushes itself to the
// underlying index automatically once it has grown to a configured size.
type FlushingBatch interface {
	// Index adds an index operation for the document with the given id to
	// the batch, possibly triggering a flush.
	Index(id string, data interface{}) error
	// Delete adds a delete operation for the document with the given id to
	// the batch, possibly triggering a flush.
	Delete(id string) error
	// Flush writes the batch's pending operations to the underlying index.
	Flush() error
}
||||
|
||||
type singleIndexFlushingBatch struct { |
||||
maxBatchSize int |
||||
batch *bleve.Batch |
||||
index bleve.Index |
||||
} |
||||
|
||||
func newFlushingBatch(index bleve.Index, maxBatchSize int) *singleIndexFlushingBatch { |
||||
return &singleIndexFlushingBatch{ |
||||
maxBatchSize: maxBatchSize, |
||||
batch: index.NewBatch(), |
||||
index: index, |
||||
} |
||||
} |
||||
|
||||
// NewFlushingBatch creates a new flushing batch for the specified index. Once
|
||||
// the number of operations in the batch reaches the specified limit, the batch
|
||||
// automatically flushes its operations to the index.
|
||||
func NewFlushingBatch(index bleve.Index, maxBatchSize int) FlushingBatch { |
||||
return newFlushingBatch(index, maxBatchSize) |
||||
} |
||||
|
||||
func (b *singleIndexFlushingBatch) Index(id string, data interface{}) error { |
||||
if err := b.batch.Index(id, data); err != nil { |
||||
return err |
||||
} |
||||
return b.flushIfFull() |
||||
} |
||||
|
||||
func (b *singleIndexFlushingBatch) Delete(id string) error { |
||||
b.batch.Delete(id) |
||||
return b.flushIfFull() |
||||
} |
||||
|
||||
func (b *singleIndexFlushingBatch) flushIfFull() error { |
||||
if b.batch.Size() < b.maxBatchSize { |
||||
return nil |
||||
} |
||||
return b.Flush() |
||||
} |
||||
|
||||
func (b *singleIndexFlushingBatch) Flush() error { |
||||
err := b.index.Batch(b.batch) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
b.batch.Reset() |
||||
return nil |
||||
} |
@ -0,0 +1,68 @@ |
||||
package rupture |
||||
|
||||
import ( |
||||
"encoding/json" |
||||
"io/ioutil" |
||||
"os" |
||||
"path/filepath" |
||||
) |
||||
|
||||
// metaFilename is the name of the file, inside an index directory, in which
// the index metadata is stored.
const metaFilename = "rupture_meta.json"

// indexMetadataPath returns the location of the metadata file for the index
// stored in dir.
func indexMetadataPath(dir string) string {
	metaPath := filepath.Join(dir, metaFilename)
	return metaPath
}
||||
|
||||
// IndexMetadata is the user-visible metadata stored alongside a bleve index.
type IndexMetadata struct {
	// Version identifies the version of the data held in the index, which
	// callers can use to track schema changes or data migrations.
	Version int `json:"version"`
}
||||
|
||||
// shardedMetadataFilename is the name of the file that holds the
// internal-only metadata kept for sharded indices, separate from the
// user-exposed index metadata.
const shardedMetadataFilename = "rupture_sharded_meta.json"

// shardedIndexMetadataPath returns the location of the internal sharding
// metadata file for the sharded index rooted at dir.
func shardedIndexMetadataPath(dir string) string {
	metaPath := filepath.Join(dir, shardedMetadataFilename)
	return metaPath
}
||||
|
||||
// shardedIndexMetadata is the internal metadata persisted for a sharded
// index.
type shardedIndexMetadata struct {
	// NumShards is the number of underlying indices the sharded index is
	// split across.
	NumShards int `json:"num_shards"`
}
||||
|
||||
// readJSON reads the file at path and decodes its JSON contents into meta,
// which must be a pointer accepted by json.Unmarshal. The read error or the
// decode error is returned unwrapped.
func readJSON(path string, meta interface{}) error {
	raw, readErr := ioutil.ReadFile(path)
	if readErr == nil {
		return json.Unmarshal(raw, meta)
	}
	return readErr
}
||||
|
||||
// writeJSON encodes meta as JSON and writes it to the file at path, using
// permission bits 0666 if the file has to be created. If encoding fails,
// nothing is written.
func writeJSON(path string, meta interface{}) error {
	encoded, err := json.Marshal(meta)
	if err == nil {
		err = ioutil.WriteFile(path, encoded, 0666)
	}
	return err
}
||||
|
||||
// ReadIndexMetadata returns the metadata for the index at the specified path.
|
||||
// If no such index metadata exists, an empty metadata and a nil error are
|
||||
// returned.
|
||||
func ReadIndexMetadata(path string) (*IndexMetadata, error) { |
||||
meta := &IndexMetadata{} |
||||
metaPath := indexMetadataPath(path) |
||||
if _, err := os.Stat(metaPath); os.IsNotExist(err) { |
||||
return meta, nil |
||||
} else if err != nil { |
||||
return nil, err |
||||
} |
||||
return meta, readJSON(metaPath, meta) |
||||
} |
||||
|
||||
// WriteIndexMetadata writes metadata for the index at the specified path.
|
||||
func WriteIndexMetadata(path string, meta *IndexMetadata) error { |
||||
return writeJSON(indexMetadataPath(path), meta) |
||||
} |
@ -0,0 +1,146 @@ |
||||
package rupture |
||||
|
||||
import ( |
||||
"fmt" |
||||
"hash/fnv" |
||||
"path/filepath" |
||||
"strconv" |
||||
|
||||
"github.com/blevesearch/bleve" |
||||
"github.com/blevesearch/bleve/document" |
||||
"github.com/blevesearch/bleve/mapping" |
||||
) |
||||
|
||||
// ShardedIndex an index that is built onto of multiple underlying bleve
|
||||
// indices (i.e. shards). Similar to bleve's index aliases, some methods may
|
||||
// not be supported.
|
||||
type ShardedIndex interface { |
||||
bleve.Index |
||||
shards() []bleve.Index |
||||
} |
||||
|
||||
// a type alias for bleve.Index, so that the anonymous field of
|
||||
// shardedIndex does not conflict with the Index(..) method.
|
||||
type bleveIndex bleve.Index |
||||
|
||||
type shardedIndex struct { |
||||
bleveIndex |
||||
indices []bleve.Index |
||||
} |
||||
|
||||
// hash maps id to a shard number in [0, n) by taking the 64-bit FNV-1 hash of
// id modulo n. n must be positive: n == 0 panics with a division by zero.
func hash(id string, n int) uint64 {
	hasher := fnv.New64()
	// Write on a hash.Hash is documented never to return an error.
	_, _ = hasher.Write([]byte(id))
	shard := hasher.Sum64() % uint64(n)
	return shard
}
||||
|
||||
// childIndexerPath returns the directory of shard number i of the sharded
// index rooted at rootPath: a subdirectory named after i in decimal.
func childIndexerPath(rootPath string, i int) string {
	shardDir := strconv.FormatInt(int64(i), 10)
	return filepath.Join(rootPath, shardDir)
}
||||
|
||||
// NewShardedIndex creates a sharded index at the specified path, with the
|
||||
// specified mapping and number of shards.
|
||||
func NewShardedIndex(path string, mapping mapping.IndexMapping, numShards int) (ShardedIndex, error) { |
||||
if numShards <= 0 { |
||||
return nil, fmt.Errorf("Invalid number of shards: %d", numShards) |
||||
} |
||||
err := writeJSON(shardedIndexMetadataPath(path), &shardedIndexMetadata{NumShards: numShards}) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
s := &shardedIndex{ |
||||
indices: make([]bleve.Index, numShards), |
||||
} |
||||
for i := 0; i < numShards; i++ { |
||||
s.indices[i], err = bleve.New(childIndexerPath(path, i), mapping) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
s.bleveIndex = bleve.NewIndexAlias(s.indices...) |
||||
return s, nil |
||||
} |
||||
|
||||
// OpenShardedIndex opens a sharded index at the specified path.
|
||||
func OpenShardedIndex(path string) (ShardedIndex, error) { |
||||
var meta shardedIndexMetadata |
||||
var err error |
||||
if err = readJSON(shardedIndexMetadataPath(path), &meta); err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
s := &shardedIndex{ |
||||
indices: make([]bleve.Index, meta.NumShards), |
||||
} |
||||
for i := 0; i < meta.NumShards; i++ { |
||||
s.indices[i], err = bleve.Open(childIndexerPath(path, i)) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
s.bleveIndex = bleve.NewIndexAlias(s.indices...) |
||||
return s, nil |
||||
} |
||||
|
||||
func (s *shardedIndex) Index(id string, data interface{}) error { |
||||
return s.indices[hash(id, len(s.indices))].Index(id, data) |
||||
} |
||||
|
||||
func (s *shardedIndex) Delete(id string) error { |
||||
return s.indices[hash(id, len(s.indices))].Delete(id) |
||||
} |
||||
|
||||
func (s *shardedIndex) Document(id string) (*document.Document, error) { |
||||
return s.indices[hash(id, len(s.indices))].Document(id) |
||||
} |
||||
|
||||
func (s *shardedIndex) Close() error { |
||||
if err := s.bleveIndex.Close(); err != nil { |
||||
return err |
||||
} |
||||
for _, index := range s.indices { |
||||
if err := index.Close(); err != nil { |
||||
return err |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
func (s *shardedIndex) shards() []bleve.Index { |
||||
return s.indices |
||||
} |
||||
|
||||
type shardedIndexFlushingBatch struct { |
||||
batches []*singleIndexFlushingBatch |
||||
} |
||||
|
||||
// NewShardedFlushingBatch creates a flushing batch with the specified batch
|
||||
// size for the specified sharded index.
|
||||
func NewShardedFlushingBatch(index ShardedIndex, maxBatchSize int) FlushingBatch { |
||||
indices := index.shards() |
||||
b := &shardedIndexFlushingBatch{ |
||||
batches: make([]*singleIndexFlushingBatch, len(indices)), |
||||
} |
||||
for i, index := range indices { |
||||
b.batches[i] = newFlushingBatch(index, maxBatchSize) |
||||
} |
||||
return b |
||||
} |
||||
|
||||
func (b *shardedIndexFlushingBatch) Index(id string, data interface{}) error { |
||||
return b.batches[hash(id, len(b.batches))].Index(id, data) |
||||
} |
||||
|
||||
func (b *shardedIndexFlushingBatch) Delete(id string) error { |
||||
return b.batches[hash(id, len(b.batches))].Delete(id) |
||||
} |
||||
|
||||
func (b *shardedIndexFlushingBatch) Flush() error { |
||||
for _, batch := range b.batches { |
||||
if err := batch.Flush(); err != nil { |
||||
return err |
||||
} |
||||
} |
||||
return nil |
||||
} |
Loading…
Reference in new issue