Vendor Update Go Libs (#13166)

* update github.com/alecthomas/chroma v0.8.0 -> v0.8.1

* github.com/blevesearch/bleve v1.0.10 -> v1.0.12

* editorconfig-core-go v2.1.1 -> v2.3.7

* github.com/gliderlabs/ssh v0.2.2 -> v0.3.1

* migrate editorconfig.ParseBytes to Parse

* github.com/shurcooL/vfsgen to 0d455de96546

* github.com/go-git/go-git/v5 v5.1.0 -> v5.2.0

* github.com/google/uuid v1.1.1 -> v1.1.2

* github.com/huandu/xstrings v1.3.0 -> v1.3.2

* github.com/klauspost/compress v1.10.11 -> v1.11.1

* github.com/markbates/goth v1.61.2 -> v1.65.0

* github.com/mattn/go-sqlite3 v1.14.0 -> v1.14.4

* github.com/mholt/archiver v3.3.0 -> v3.3.2

* github.com/microcosm-cc/bluemonday 4f7140c49acb -> v1.0.4

* github.com/minio/minio-go v7.0.4 -> v7.0.5

* github.com/olivere/elastic v7.0.9 -> v7.0.20

* github.com/urfave/cli v1.20.0 -> v1.22.4

* github.com/prometheus/client_golang v1.1.0 -> v1.8.0

* github.com/xanzy/go-gitlab v0.37.0 -> v0.38.1

* mvdan.cc/xurls v2.1.0 -> v2.2.0

Co-authored-by: Lauris BH <lauris@nix.lv>
tokarchuk/v1.17
6543 4 years ago committed by GitHub
parent 91f2afdb54
commit 12a1f914f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 65
      go.mod
  2. 581
      go.sum
  3. 6
      modules/context/repo.go
  4. 12
      vendor/cloud.google.com/go/compute/metadata/.repo-metadata.json
  5. 41
      vendor/cloud.google.com/go/compute/metadata/metadata.go
  6. 6
      vendor/github.com/RoaringBitmap/roaring/.travis.yml
  7. 70
      vendor/github.com/RoaringBitmap/roaring/README.md
  8. 35
      vendor/github.com/RoaringBitmap/roaring/arraycontainer.go
  9. 53
      vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go
  10. 117
      vendor/github.com/RoaringBitmap/roaring/fastaggregation.go
  11. 14
      vendor/github.com/RoaringBitmap/roaring/manyiterator.go
  12. 36
      vendor/github.com/RoaringBitmap/roaring/roaring.go
  13. 41
      vendor/github.com/RoaringBitmap/roaring/runcontainer.go
  14. 5
      vendor/github.com/RoaringBitmap/roaring/util.go
  15. 10
      vendor/github.com/alecthomas/chroma/lexers/a/awk.go
  16. 7
      vendor/github.com/alecthomas/chroma/lexers/e/elixir.go
  17. 2
      vendor/github.com/alecthomas/chroma/lexers/j/javascript.go
  18. 2
      vendor/github.com/alecthomas/chroma/lexers/j/jsx.go
  19. 59
      vendor/github.com/alecthomas/chroma/lexers/k/kotlin.go
  20. 3
      vendor/github.com/alecthomas/chroma/lexers/y/yaml.go
  21. 2
      vendor/github.com/andybalholm/brotli/README.md
  22. 36
      vendor/github.com/andybalholm/brotli/backward_references.go
  23. 37
      vendor/github.com/andybalholm/brotli/backward_references_hq.go
  24. 37
      vendor/github.com/andybalholm/brotli/block_splitter.go
  25. 3
      vendor/github.com/andybalholm/brotli/block_splitter_command.go
  26. 472
      vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
  27. 24
      vendor/github.com/andybalholm/brotli/command.go
  28. 365
      vendor/github.com/andybalholm/brotli/compress_fragment.go
  29. 250
      vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
  30. 929
      vendor/github.com/andybalholm/brotli/encode.go
  31. 13
      vendor/github.com/andybalholm/brotli/entropy_encode.go
  32. 16
      vendor/github.com/andybalholm/brotli/entropy_encode_static.go
  33. 29
      vendor/github.com/andybalholm/brotli/find_match_length.go
  34. 2
      vendor/github.com/andybalholm/brotli/go.mod
  35. 2
      vendor/github.com/andybalholm/brotli/go.sum
  36. 7
      vendor/github.com/andybalholm/brotli/histogram.go
  37. 192
      vendor/github.com/andybalholm/brotli/http.go
  38. 10
      vendor/github.com/andybalholm/brotli/memory.go
  39. 117
      vendor/github.com/andybalholm/brotli/metablock.go
  40. 5
      vendor/github.com/andybalholm/brotli/metablock_command.go
  41. 7
      vendor/github.com/andybalholm/brotli/metablock_distance.go
  42. 7
      vendor/github.com/andybalholm/brotli/metablock_literal.go
  43. 12
      vendor/github.com/andybalholm/brotli/ringbuffer.go
  44. 106
      vendor/github.com/andybalholm/brotli/write_bits.go
  45. 57
      vendor/github.com/andybalholm/brotli/writer.go
  46. 7
      vendor/github.com/anmitsu/go-shlex/README.md
  47. 3
      vendor/github.com/anmitsu/go-shlex/go.mod
  48. 7
      vendor/github.com/blevesearch/bleve/config.go
  49. 9
      vendor/github.com/blevesearch/bleve/go.mod
  50. 22
      vendor/github.com/blevesearch/bleve/index/scorch/scorch.go
  51. 2
      vendor/github.com/blevesearch/bleve/index/scorch/segment_plugin.go
  52. 27
      vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go
  53. 10
      vendor/github.com/blevesearch/bleve/index_impl.go
  54. 10
      vendor/github.com/blevesearch/bleve/search/highlight/format/html/html.go
  55. 2
      vendor/github.com/blevesearch/zap/v11/go.mod
  56. 2
      vendor/github.com/blevesearch/zap/v12/go.mod
  57. 2
      vendor/github.com/blevesearch/zap/v13/go.mod
  58. 2
      vendor/github.com/blevesearch/zap/v14/go.mod
  59. 12
      vendor/github.com/blevesearch/zap/v15/.gitignore
  60. 202
      vendor/github.com/blevesearch/zap/v15/LICENSE
  61. 158
      vendor/github.com/blevesearch/zap/v15/README.md
  62. 156
      vendor/github.com/blevesearch/zap/v15/build.go
  63. 67
      vendor/github.com/blevesearch/zap/v15/chunk.go
  64. 243
      vendor/github.com/blevesearch/zap/v15/contentcoder.go
  65. 61
      vendor/github.com/blevesearch/zap/v15/count.go
  66. 263
      vendor/github.com/blevesearch/zap/v15/dict.go
  67. 312
      vendor/github.com/blevesearch/zap/v15/docvalues.go
  68. 138
      vendor/github.com/blevesearch/zap/v15/enumerator.go
  69. 12
      vendor/github.com/blevesearch/zap/v15/go.mod
  70. 118
      vendor/github.com/blevesearch/zap/v15/intDecoder.go
  71. 206
      vendor/github.com/blevesearch/zap/v15/intcoder.go
  72. 852
      vendor/github.com/blevesearch/zap/v15/merge.go
  73. 860
      vendor/github.com/blevesearch/zap/v15/new.go
  74. 37
      vendor/github.com/blevesearch/zap/v15/plugin.go
  75. 801
      vendor/github.com/blevesearch/zap/v15/posting.go
  76. 43
      vendor/github.com/blevesearch/zap/v15/read.go
  77. 572
      vendor/github.com/blevesearch/zap/v15/segment.go
  78. 145
      vendor/github.com/blevesearch/zap/v15/write.go
  79. 177
      vendor/github.com/blevesearch/zap/v15/zap.md
  80. 8
      vendor/github.com/cespare/xxhash/v2/.travis.yml
  81. 22
      vendor/github.com/cespare/xxhash/v2/LICENSE.txt
  82. 67
      vendor/github.com/cespare/xxhash/v2/README.md
  83. 3
      vendor/github.com/cespare/xxhash/v2/go.mod
  84. 0
      vendor/github.com/cespare/xxhash/v2/go.sum
  85. 236
      vendor/github.com/cespare/xxhash/v2/xxhash.go
  86. 13
      vendor/github.com/cespare/xxhash/v2/xxhash_amd64.go
  87. 215
      vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s
  88. 76
      vendor/github.com/cespare/xxhash/v2/xxhash_other.go
  89. 15
      vendor/github.com/cespare/xxhash/v2/xxhash_safe.go
  90. 46
      vendor/github.com/cespare/xxhash/v2/xxhash_unsafe.go
  91. 21
      vendor/github.com/cpuguy83/go-md2man/v2/LICENSE.md
  92. 14
      vendor/github.com/cpuguy83/go-md2man/v2/md2man/md2man.go
  93. 345
      vendor/github.com/cpuguy83/go-md2man/v2/md2man/roff.go
  94. 9
      vendor/github.com/dlclark/regexp2/README.md
  95. 19
      vendor/github.com/dlclark/regexp2/runner.go
  96. 21
      vendor/github.com/dlclark/regexp2/syntax/parser.go
  97. 13
      vendor/github.com/editorconfig/editorconfig-core-go/v2/.editorconfig
  98. 19
      vendor/github.com/editorconfig/editorconfig-core-go/v2/.golangci.yml
  99. 74
      vendor/github.com/editorconfig/editorconfig-core-go/v2/.goreleaser.yml
  100. 30
      vendor/github.com/editorconfig/editorconfig-core-go/v2/.travis.yml
  101. Some files were not shown because too many files have changed in this diff Show More

@ -18,27 +18,30 @@ require (
gitea.com/macaron/session v0.0.0-20200902202411-e3a87877db6e
gitea.com/macaron/toolbox v0.0.0-20190822013122-05ff0fc766b7
github.com/PuerkitoBio/goquery v1.5.1
github.com/alecthomas/chroma v0.8.0
github.com/blevesearch/bleve v1.0.10
github.com/RoaringBitmap/roaring v0.5.1 // indirect
github.com/alecthomas/chroma v0.8.1
github.com/andybalholm/brotli v1.0.1 // indirect
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be // indirect
github.com/blevesearch/bleve v1.0.12
github.com/couchbase/gomemcached v0.0.0-20191004160342-7b5da2ec40b2 // indirect
github.com/cznic/b v0.0.0-20181122101859-a26611c4d92d // indirect
github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548 // indirect
github.com/cznic/strutil v0.0.0-20181122101858-275e90344537 // indirect
github.com/denisenkom/go-mssqldb v0.0.0-20200428022330-06a60b6afbbc
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/dlclark/regexp2 v1.2.1 // indirect
github.com/dlclark/regexp2 v1.4.0 // indirect
github.com/dustin/go-humanize v1.0.0
github.com/editorconfig/editorconfig-core-go/v2 v2.1.1
github.com/editorconfig/editorconfig-core-go/v2 v2.3.7
github.com/emirpasic/gods v1.12.0
github.com/ethantkoenig/rupture v0.0.0-20180203182544-0a76f03a811a
github.com/facebookgo/ensure v0.0.0-20160127193407-b4ab57deab51 // indirect
github.com/facebookgo/stack v0.0.0-20160209184415-751773369052 // indirect
github.com/facebookgo/subset v0.0.0-20150612182917-8dac2c3c4870 // indirect
github.com/gliderlabs/ssh v0.2.2
github.com/gliderlabs/ssh v0.3.1
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a // indirect
github.com/go-enry/go-enry/v2 v2.5.2
github.com/go-git/go-billy/v5 v5.0.0
github.com/go-git/go-git/v5 v5.1.0
github.com/go-git/go-git/v5 v5.2.0
github.com/go-ldap/ldap/v3 v3.2.4
github.com/go-redis/redis/v7 v7.4.0
github.com/go-sql-driver/mysql v1.5.0
@ -47,76 +50,80 @@ require (
github.com/gobwas/glob v0.2.3
github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28
github.com/gogs/cron v0.0.0-20171120032916-9f6c956d3e14
github.com/golang/snappy v0.0.2 // indirect
github.com/google/go-github/v32 v32.1.0
github.com/google/uuid v1.1.1
github.com/google/uuid v1.1.2
github.com/gorilla/context v1.1.1
github.com/hashicorp/go-retryablehttp v0.6.7 // indirect
github.com/hashicorp/go-version v1.2.1
github.com/huandu/xstrings v1.3.0
github.com/huandu/xstrings v1.3.2
github.com/imdario/mergo v0.3.11 // indirect
github.com/issue9/assert v1.3.2 // indirect
github.com/issue9/identicon v1.0.1
github.com/jaytaylor/html2text v0.0.0-20160923191438-8fb95d837f7d
github.com/jmhodges/levigo v1.0.0 // indirect
github.com/kballard/go-shellquote v0.0.0-20170619183022-cd60e84ee657
github.com/keybase/go-crypto v0.0.0-20200123153347-de78d2cb44f4
github.com/klauspost/compress v1.10.11
github.com/klauspost/compress v1.11.1
github.com/klauspost/pgzip v1.2.5 // indirect
github.com/lafriks/xormstore v1.3.2
github.com/lib/pq v1.8.1-0.20200908161135-083382b7e6fc
github.com/lunny/dingtalk_webhook v0.0.0-20171025031554-e3534c89ef96
github.com/markbates/goth v1.61.2
github.com/mattn/go-colorable v0.1.7 // indirect
github.com/markbates/goth v1.65.0
github.com/mattn/go-isatty v0.0.12
github.com/mattn/go-runewidth v0.0.9 // indirect
github.com/mattn/go-sqlite3 v1.14.0
github.com/mattn/go-sqlite3 v1.14.4
github.com/mgechev/dots v0.0.0-20190921121421-c36f7dcfbb81
github.com/mgechev/revive v1.0.3-0.20200921231451-246eac737dc7
github.com/mholt/archiver/v3 v3.3.0
github.com/microcosm-cc/bluemonday v1.0.3-0.20191119130333-0a75d7616912
github.com/minio/minio-go/v7 v7.0.4
github.com/mholt/archiver/v3 v3.3.2
github.com/microcosm-cc/bluemonday v1.0.4
github.com/minio/minio-go/v7 v7.0.5
github.com/mitchellh/go-homedir v1.1.0
github.com/msteinert/pam v0.0.0-20151204160544-02ccfbfaf0cc
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
github.com/niklasfasching/go-org v1.3.2
github.com/oliamb/cutter v0.2.2
github.com/olivere/elastic/v7 v7.0.9
github.com/olivere/elastic/v7 v7.0.20
github.com/pelletier/go-toml v1.8.1
github.com/pkg/errors v0.9.1
github.com/pquerna/otp v1.2.0
github.com/prometheus/client_golang v1.1.0
github.com/prometheus/procfs v0.0.4 // indirect
github.com/prometheus/client_golang v1.8.0
github.com/quasoft/websspi v1.0.0
github.com/remyoudompheng/bigfft v0.0.0-20190321074620-2f0d2b0e0001 // indirect
github.com/sergi/go-diff v1.1.0
github.com/shurcooL/httpfs v0.0.0-20190527155220-6a4d4a70508b // indirect
github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd
github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 // indirect
github.com/shurcooL/vfsgen v0.0.0-20200824052919-0d455de96546
github.com/stretchr/testify v1.6.1
github.com/syndtr/goleveldb v1.0.0
github.com/tecbot/gorocksdb v0.0.0-20181010114359-8752a9433481 // indirect
github.com/tinylib/msgp v1.1.2 // indirect
github.com/tstranex/u2f v1.0.0
github.com/ulikunitz/xz v0.5.8 // indirect
github.com/unknwon/com v1.0.1
github.com/unknwon/i18n v0.0.0-20200823051745-09abd91c7f2c
github.com/unknwon/paginater v0.0.0-20151104151617-7748a72e0141
github.com/urfave/cli v1.20.0
github.com/xanzy/go-gitlab v0.37.0
github.com/urfave/cli v1.22.4
github.com/willf/bitset v1.1.11 // indirect
github.com/xanzy/go-gitlab v0.38.1
github.com/yohcop/openid-go v1.0.0
github.com/yuin/goldmark v1.2.1
github.com/yuin/goldmark-highlighting v0.0.0-20200307114337-60d527fdb691
github.com/yuin/goldmark-meta v0.0.0-20191126180153-f0638e958b60
go.jolheiser.com/hcaptcha v0.0.4
go.jolheiser.com/pwn v0.0.3
golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a
golang.org/x/net v0.0.0-20200904194848-62affa334b73
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
golang.org/x/sys v0.0.0-20200918174421-af09f7315aff
golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee
golang.org/x/net v0.0.0-20201010224723-4f7140c49acb
golang.org/x/oauth2 v0.0.0-20200902213428-5d25da1a8d43
golang.org/x/sys v0.0.0-20201015000850-e3ed0017c211
golang.org/x/text v0.3.3
golang.org/x/time v0.0.0-20200630173020-3af7569d3a1e // indirect
golang.org/x/tools v0.0.0-20200921210052-fa0125251cc4
golang.org/x/tools v0.0.0-20200929161345-d7fc70abf50f
google.golang.org/appengine v1.6.7 // indirect
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
gopkg.in/ini.v1 v1.61.0
gopkg.in/ini.v1 v1.62.0
gopkg.in/yaml.v2 v2.3.0
mvdan.cc/xurls/v2 v2.1.0
mvdan.cc/xurls/v2 v2.2.0
strk.kbt.io/projects/go/libravatar v0.0.0-20191008002943-06d1c002b251
xorm.io/builder v0.3.7
xorm.io/xorm v1.0.5

581
go.sum

File diff suppressed because it is too large Load Diff

@ -222,11 +222,7 @@ func (r *Repository) GetEditorconfig() (*editorconfig.Editorconfig, error) {
return nil, err
}
defer reader.Close()
data, err := ioutil.ReadAll(reader)
if err != nil {
return nil, err
}
return editorconfig.ParseBytes(data)
return editorconfig.Parse(reader)
}
// RetrieveBaseRepo retrieves base repository

@ -1,12 +0,0 @@
{
"name": "metadata",
"name_pretty": "Google Compute Engine Metadata API",
"product_documentation": "https://cloud.google.com/compute/docs/storing-retrieving-metadata",
"client_documentation": "https://godoc.org/cloud.google.com/go/compute/metadata",
"release_level": "ga",
"language": "go",
"repo": "googleapis/google-cloud-go",
"distribution_name": "cloud.google.com/go/compute/metadata",
"api_id": "compute:metadata",
"requires_billing": false
}

@ -61,25 +61,14 @@ var (
instID = &cachedValue{k: "instance/id", trim: true}
)
var (
defaultClient = &Client{hc: &http.Client{
var defaultClient = &Client{hc: &http.Client{
Transport: &http.Transport{
Dial: (&net.Dialer{
Timeout: 2 * time.Second,
KeepAlive: 30 * time.Second,
}).Dial,
ResponseHeaderTimeout: 2 * time.Second,
},
}}
subscribeClient = &Client{hc: &http.Client{
Transport: &http.Transport{
Dial: (&net.Dialer{
Timeout: 2 * time.Second,
KeepAlive: 30 * time.Second,
}).Dial,
},
}}
)
// NotDefinedError is returned when requested metadata is not defined.
//
@ -151,7 +140,7 @@ func testOnGCE() bool {
}()
go func() {
addrs, err := net.LookupHost("metadata.google.internal")
addrs, err := net.DefaultResolver.LookupHost(ctx, "metadata.google.internal")
if err != nil || len(addrs) == 0 {
resc <- false
return
@ -206,10 +195,9 @@ func systemInfoSuggestsGCE() bool {
return name == "Google" || name == "Google Compute Engine"
}
// Subscribe calls Client.Subscribe on a client designed for subscribing (one with no
// ResponseHeaderTimeout).
// Subscribe calls Client.Subscribe on the default client.
func Subscribe(suffix string, fn func(v string, ok bool) error) error {
return subscribeClient.Subscribe(suffix, fn)
return defaultClient.Subscribe(suffix, fn)
}
// Get calls Client.Get on the default client.
@ -280,9 +268,14 @@ type Client struct {
hc *http.Client
}
// NewClient returns a Client that can be used to fetch metadata. All HTTP requests
// will use the given http.Client instead of the default client.
// NewClient returns a Client that can be used to fetch metadata.
// Returns the client that uses the specified http.Client for HTTP requests.
// If nil is specified, returns the default client.
func NewClient(c *http.Client) *Client {
if c == nil {
return defaultClient
}
return &Client{hc: c}
}
@ -303,8 +296,12 @@ func (c *Client) getETag(suffix string) (value, etag string, err error) {
// being stable anyway.
host = metadataIP
}
suffix = strings.TrimLeft(suffix, "/")
u := "http://" + host + "/computeMetadata/v1/" + suffix
req, _ := http.NewRequest("GET", u, nil)
req, err := http.NewRequest("GET", u, nil)
if err != nil {
return "", "", err
}
req.Header.Set("Metadata-Flavor", "Google")
req.Header.Set("User-Agent", userAgent)
res, err := c.hc.Do(req)
@ -407,11 +404,7 @@ func (c *Client) InstanceTags() ([]string, error) {
// InstanceName returns the current VM's instance ID string.
func (c *Client) InstanceName() (string, error) {
host, err := c.Hostname()
if err != nil {
return "", err
}
return strings.Split(host, ".")[0], nil
return c.getTrimmed("instance/name")
}
// Zone returns the current VM's zone, such as "us-central1-b".

@ -8,13 +8,9 @@ install:
notifications:
email: false
go:
- "1.7.x"
- "1.8.x"
- "1.9.x"
- "1.10.x"
- "1.11.x"
- "1.12.x"
- "1.13.x"
- "1.14.x"
- tip
# whitelist

@ -1,5 +1,9 @@
roaring [![Build Status](https://travis-ci.org/RoaringBitmap/roaring.png)](https://travis-ci.org/RoaringBitmap/roaring) [![Coverage Status](https://coveralls.io/repos/github/RoaringBitmap/roaring/badge.svg?branch=master)](https://coveralls.io/github/RoaringBitmap/roaring?branch=master) [![GoDoc](https://godoc.org/github.com/RoaringBitmap/roaring?status.svg)](https://godoc.org/github.com/RoaringBitmap/roaring) [![Go Report Card](https://goreportcard.com/badge/RoaringBitmap/roaring)](https://goreportcard.com/report/github.com/RoaringBitmap/roaring)
roaring [![Build Status](https://travis-ci.org/RoaringBitmap/roaring.png)](https://travis-ci.org/RoaringBitmap/roaring) [![GoDoc](https://godoc.org/github.com/RoaringBitmap/roaring?status.svg)](https://godoc.org/github.com/RoaringBitmap/roaring) [![GoDoc](https://godoc.org/github.com/RoaringBitmap/roaring/roaring64?status.svg)](https://godoc.org/github.com/RoaringBitmap/roaring/roaring64) [![Go Report Card](https://goreportcard.com/badge/RoaringBitmap/roaring)](https://goreportcard.com/report/github.com/RoaringBitmap/roaring)
[![Build Status](https://cloud.drone.io/api/badges/RoaringBitmap/roaring/status.svg)](https://cloud.drone.io/RoaringBitmap/roaring)
![Go-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-CI/badge.svg)
![Go-ARM-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-ARM-CI/badge.svg)
![Go-Windows-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-Windows-CI/badge.svg)
![Go-macos-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-macos-CI/badge.svg)
=============
This is a go version of the Roaring bitmap data structure.
@ -7,7 +11,7 @@ This is a go version of the Roaring bitmap data structure.
Roaring bitmaps are used by several major systems such as [Apache Lucene][lucene] and derivative systems such as [Solr][solr] and
[Elasticsearch][elasticsearch], [Apache Druid (Incubating)][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas], [Apache Spark][spark], [OpenSearchServer][opensearchserver], [Cloud Torrent][cloudtorrent], [Whoosh][whoosh], [Pilosa][pilosa], [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin].
[Elasticsearch][elasticsearch], [Apache Druid (Incubating)][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas], [Apache Spark][spark], [OpenSearchServer][opensearchserver], [Cloud Torrent][cloudtorrent], [Whoosh][whoosh], [Pilosa][pilosa], [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin]. The YouTube SQL Engine, [Google Procella](https://research.google/pubs/pub48388/), uses Roaring bitmaps for indexing.
[lucene]: https://lucene.apache.org/
[solr]: https://lucene.apache.org/solr/
@ -172,10 +176,70 @@ That is, given a fixed overhead for the universe size (x), Roaring
bitmaps never use more than 2 bytes per integer. You can call
``BoundSerializedSizeInBytes`` for a more precise estimate.
### 64-bit Roaring
By default, roaring is used to stored unsigned 32-bit integers. However, we also offer
an extension dedicated to 64-bit integers. It supports roughly the same functions:
```go
package main
import (
"fmt"
"github.com/RoaringBitmap/roaring/roaring64"
"bytes"
)
func main() {
// example inspired by https://github.com/fzandona/goroar
fmt.Println("==roaring64==")
rb1 := roaring64.BitmapOf(1, 2, 3, 4, 5, 100, 1000)
fmt.Println(rb1.String())
rb2 := roaring64.BitmapOf(3, 4, 1000)
fmt.Println(rb2.String())
rb3 := roaring64.New()
fmt.Println(rb3.String())
fmt.Println("Cardinality: ", rb1.GetCardinality())
fmt.Println("Contains 3? ", rb1.Contains(3))
rb1.And(rb2)
rb3.Add(1)
rb3.Add(5)
rb3.Or(rb1)
// prints 1, 3, 4, 5, 1000
i := rb3.Iterator()
for i.HasNext() {
fmt.Println(i.Next())
}
fmt.Println()
// next we include an example of serialization
buf := new(bytes.Buffer)
rb1.WriteTo(buf) // we omit error handling
newrb:= roaring64.New()
newrb.ReadFrom(buf)
if rb1.Equals(newrb) {
fmt.Println("I wrote the content to a byte stream and read it back.")
}
// you can iterate over bitmaps using ReverseIterator(), Iterator, ManyIterator()
}
```
Only the 32-bit roaring format is standard and cross-operable between Java, C++, C and Go. There is no guarantee that the 64-bit versions are compatible.
### Documentation
Current documentation is available at http://godoc.org/github.com/RoaringBitmap/roaring
Current documentation is available at http://godoc.org/github.com/RoaringBitmap/roaring and http://godoc.org/github.com/RoaringBitmap/roaring64
### Goroutine safety

@ -876,6 +876,41 @@ func (ac *arrayContainer) loadData(bitmapContainer *bitmapContainer) {
ac.content = make([]uint16, bitmapContainer.cardinality, bitmapContainer.cardinality)
bitmapContainer.fillArray(ac.content)
}
func (ac *arrayContainer) resetTo(a container) {
switch x := a.(type) {
case *arrayContainer:
ac.realloc(len(x.content))
copy(ac.content, x.content)
case *bitmapContainer:
ac.realloc(x.cardinality)
x.fillArray(ac.content)
case *runContainer16:
card := int(x.cardinality())
ac.realloc(card)
cur := 0
for _, r := range x.iv {
for val := r.start; val <= r.last(); val++ {
ac.content[cur] = val
cur++
}
}
default:
panic("unsupported container type")
}
}
func (ac *arrayContainer) realloc(size int) {
if cap(ac.content) < size {
ac.content = make([]uint16, size)
} else {
ac.content = ac.content[:size]
}
}
func newArrayContainer() *arrayContainer {
p := new(arrayContainer)
return p

@ -203,6 +203,33 @@ func (bcmi *bitmapContainerManyIterator) nextMany(hs uint32, buf []uint32) int {
return n
}
func (bcmi *bitmapContainerManyIterator) nextMany64(hs uint64, buf []uint64) int {
n := 0
base := bcmi.base
bitset := bcmi.bitset
for n < len(buf) {
if bitset == 0 {
base++
if base >= len(bcmi.ptr.bitmap) {
bcmi.base = base
bcmi.bitset = bitset
return n
}
bitset = bcmi.ptr.bitmap[base]
continue
}
t := bitset & -bitset
buf[n] = uint64(((base * 64) + int(popcount(t-1)))) | hs
n = n + 1
bitset ^= t
}
bcmi.base = base
bcmi.bitset = bitset
return n
}
func newBitmapContainerManyIterator(a *bitmapContainer) *bitmapContainerManyIterator {
return &bitmapContainerManyIterator{a, -1, 0}
}
@ -934,6 +961,32 @@ func (bc *bitmapContainer) loadData(arrayContainer *arrayContainer) {
}
}
func (bc *bitmapContainer) resetTo(a container) {
switch x := a.(type) {
case *arrayContainer:
fill(bc.bitmap, 0)
bc.loadData(x)
case *bitmapContainer:
bc.cardinality = x.cardinality
copy(bc.bitmap, x.bitmap)
case *runContainer16:
bc.cardinality = len(x.iv)
lastEnd := 0
for _, r := range x.iv {
bc.cardinality += int(r.length)
resetBitmapRange(bc.bitmap, lastEnd, int(r.start))
lastEnd = int(r.start+r.length) + 1
setBitmapRange(bc.bitmap, int(r.start), lastEnd)
}
resetBitmapRange(bc.bitmap, lastEnd, maxCapacity)
default:
panic("unsupported container type")
}
}
func (bc *bitmapContainer) toArrayContainer() *arrayContainer {
ac := &arrayContainer{}
ac.loadData(bc)

@ -213,3 +213,120 @@ func HeapXor(bitmaps ...*Bitmap) *Bitmap {
}
return heap.Pop(&pq).(*item).value
}
// AndAny provides a result equivalent to x1.And(FastOr(bitmaps)).
// It's optimized to minimize allocations. It also might be faster than separate calls.
func (x1 *Bitmap) AndAny(bitmaps ...*Bitmap) {
if len(bitmaps) == 0 {
return
} else if len(bitmaps) == 1 {
x1.And(bitmaps[0])
return
}
type withPos struct {
bitmap *roaringArray
pos int
key uint16
}
filters := make([]withPos, 0, len(bitmaps))
for _, b := range bitmaps {
if b.highlowcontainer.size() > 0 {
filters = append(filters, withPos{
bitmap: &b.highlowcontainer,
pos: 0,
key: b.highlowcontainer.getKeyAtIndex(0),
})
}
}
basePos := 0
intersections := 0
keyContainers := make([]container, 0, len(filters))
var (
tmpArray *arrayContainer
tmpBitmap *bitmapContainer
minNextKey uint16
)
for basePos < x1.highlowcontainer.size() && len(filters) > 0 {
baseKey := x1.highlowcontainer.getKeyAtIndex(basePos)
// accumulate containers for current key, find next minimal key in filters
// and exclude filters that do not have related values anymore
i := 0
maxPossibleOr := 0
minNextKey = MaxUint16
for _, f := range filters {
if f.key < baseKey {
f.pos = f.bitmap.advanceUntil(baseKey, f.pos)
if f.pos == f.bitmap.size() {
continue
}
f.key = f.bitmap.getKeyAtIndex(f.pos)
}
if f.key == baseKey {
cont := f.bitmap.getContainerAtIndex(f.pos)
keyContainers = append(keyContainers, cont)
maxPossibleOr += cont.getCardinality()
f.pos++
if f.pos == f.bitmap.size() {
continue
}
f.key = f.bitmap.getKeyAtIndex(f.pos)
}
minNextKey = minOfUint16(minNextKey, f.key)
filters[i] = f
i++
}
filters = filters[:i]
if len(keyContainers) == 0 {
basePos = x1.highlowcontainer.advanceUntil(minNextKey, basePos)
continue
}
var ored container
if len(keyContainers) == 1 {
ored = keyContainers[0]
} else {
//TODO: special case for run containers?
if maxPossibleOr > arrayDefaultMaxSize {
if tmpBitmap == nil {
tmpBitmap = newBitmapContainer()
}
tmpBitmap.resetTo(keyContainers[0])
for _, c := range keyContainers[1:] {
tmpBitmap.ior(c)
}
ored = tmpBitmap
} else {
if tmpArray == nil {
tmpArray = newArrayContainerCapacity(maxPossibleOr)
}
tmpArray.realloc(maxPossibleOr)
tmpArray.resetTo(keyContainers[0])
for _, c := range keyContainers[1:] {
tmpArray.ior(c)
}
ored = tmpArray
}
}
result := x1.highlowcontainer.getWritableContainerAtIndex(basePos).iand(ored)
if result.getCardinality() > 0 {
x1.highlowcontainer.replaceKeyAndContainerAtIndex(intersections, baseKey, result, false)
intersections++
}
keyContainers = keyContainers[:0]
basePos = x1.highlowcontainer.advanceUntil(minNextKey, basePos)
}
x1.highlowcontainer.resize(intersections)
}

@ -2,6 +2,7 @@ package roaring
type manyIterable interface {
nextMany(hs uint32, buf []uint32) int
nextMany64(hs uint64, buf []uint64) int
}
func (si *shortIterator) nextMany(hs uint32, buf []uint32) int {
@ -16,3 +17,16 @@ func (si *shortIterator) nextMany(hs uint32, buf []uint32) int {
si.loc = l
return n
}
func (si *shortIterator) nextMany64(hs uint64, buf []uint64) int {
n := 0
l := si.loc
s := si.slice
for n < len(buf) && l < len(s) {
buf[n] = uint64(s[l]) | hs
l++
n++
}
si.loc = l
return n
}

@ -346,7 +346,9 @@ func newIntReverseIterator(a *Bitmap) *intReverseIterator {
// ManyIntIterable allows you to iterate over the values in a Bitmap
type ManyIntIterable interface {
// pass in a buffer to fill up with values, returns how many values were returned
NextMany([]uint32) int
NextMany(buf []uint32) int
// pass in a buffer to fill up with 64 bit values, returns how many values were returned
NextMany64(hs uint64, buf []uint64) int
}
type manyIntIterator struct {
@ -382,6 +384,25 @@ func (ii *manyIntIterator) NextMany(buf []uint32) int {
return n
}
func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int {
n := 0
for n < len(buf) {
if ii.iter == nil {
break
}
hs := uint64(ii.hs) | hs64
moreN := ii.iter.nextMany64(hs, buf[n:])
n += moreN
if moreN == 0 {
ii.pos = ii.pos + 1
ii.init()
}
}
return n
}
func newManyIntIterator(a *Bitmap) *manyIntIterator {
p := new(manyIntIterator)
p.pos = 0
@ -678,7 +699,10 @@ func (rb *Bitmap) GetCardinality() uint64 {
return size
}
// Rank returns the number of integers that are smaller or equal to x (Rank(infinity) would be GetCardinality())
// Rank returns the number of integers that are smaller or equal to x (Rank(infinity) would be GetCardinality()).
// If you pass the smallest value, you get the value 1. If you pass a value that is smaller than the smallest
// value, you get 0. Note that this function differs in convention from the Select function since it
// return 1 and not 0 on the smallest value.
func (rb *Bitmap) Rank(x uint32) uint64 {
size := uint64(0)
for i := 0; i < rb.highlowcontainer.size(); i++ {
@ -695,7 +719,9 @@ func (rb *Bitmap) Rank(x uint32) uint64 {
return size
}
// Select returns the xth integer in the bitmap
// Select returns the xth integer in the bitmap. If you pass 0, you get
// the smallest element. Note that this function differs in convention from
// the Rank function which returns 1 on the smallest value.
func (rb *Bitmap) Select(x uint32) (uint32, error) {
if rb.GetCardinality() <= uint64(x) {
return 0, fmt.Errorf("can't find %dth integer in a bitmap with only %d items", x, rb.GetCardinality())
@ -1555,3 +1581,7 @@ func (rb *Bitmap) Stats() Statistics {
}
return stats
}
func (rb *Bitmap) FillLeastSignificant32bits(x []uint64, i uint64, mask uint64) {
rb.ManyIterator().NextMany64(mask, x[i:])
}

@ -1321,6 +1321,47 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int {
return n
}
func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int {
n := 0
if !ri.hasNext() {
return n
}
// start and end are inclusive
for n < len(buf) {
moreVals := 0
if ri.rc.iv[ri.curIndex].length >= ri.curPosInIndex {
// add as many as you can from this seq
moreVals = minOfInt(int(ri.rc.iv[ri.curIndex].length-ri.curPosInIndex)+1, len(buf)-n)
base := uint64(ri.rc.iv[ri.curIndex].start+ri.curPosInIndex) | hs
// allows BCE
buf2 := buf[n : n+moreVals]
for i := range buf2 {
buf2[i] = base + uint64(i)
}
// update values
n += moreVals
}
if moreVals+int(ri.curPosInIndex) > int(ri.rc.iv[ri.curIndex].length) {
ri.curPosInIndex = 0
ri.curIndex++
if ri.curIndex == int64(len(ri.rc.iv)) {
break
}
} else {
ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16
}
}
return n
}
// remove removes key from the container.
func (rc *runContainer16) removeKey(key uint16) (wasPresent bool) {

@ -1,6 +1,7 @@
package roaring
import (
"math"
"math/rand"
"sort"
)
@ -15,7 +16,7 @@ const (
noOffsetThreshold = 4
// MaxUint32 is the largest uint32 value.
MaxUint32 = 4294967295
MaxUint32 = math.MaxUint32
// MaxRange is One more than the maximum allowed bitmap bit index. For use as an upper
// bound for ranges.
@ -23,7 +24,7 @@ const (
// MaxUint16 is the largest 16 bit unsigned int.
// This is the largest value an interval16 can store.
MaxUint16 = 65535
MaxUint16 = math.MaxUint16
// Compute wordSizeInBytes, the size of a word in bytes.
_m = ^uint64(0)

@ -30,14 +30,14 @@ var Awk = internal.Register(MustNewLexer(
"root": {
{`^(?=\s|/)`, Text, Push("slashstartsregex")},
Include("commentsandwhitespace"),
{`\+\+|--|\|\||&&|in\b|\$|!?~|(\*\*|[-<>+*%\^/!=|])=?`, Operator, Push("slashstartsregex")},
{`\+\+|--|\|\||&&|in\b|\$|!?~|\|&|(\*\*|[-<>+*%\^/!=|])=?`, Operator, Push("slashstartsregex")},
{`[{(\[;,]`, Punctuation, Push("slashstartsregex")},
{`[})\].]`, Punctuation, nil},
{`(break|continue|do|while|exit|for|if|else|return)\b`, Keyword, Push("slashstartsregex")},
{`(break|continue|do|while|exit|for|if|else|return|switch|case|default)\b`, Keyword, Push("slashstartsregex")},
{`function\b`, KeywordDeclaration, Push("slashstartsregex")},
{`(atan2|cos|exp|int|log|rand|sin|sqrt|srand|gensub|gsub|index|length|match|split|sprintf|sub|substr|tolower|toupper|close|fflush|getline|next|nextfile|print|printf|strftime|systime|delete|system)\b`, KeywordReserved, nil},
{`(ARGC|ARGIND|ARGV|BEGIN|CONVFMT|ENVIRON|END|ERRNO|FIELDWIDTHS|FILENAME|FNR|FS|IGNORECASE|NF|NR|OFMT|OFS|ORFS|RLENGTH|RS|RSTART|RT|SUBSEP)\b`, NameBuiltin, nil},
{`[$a-zA-Z_]\w*`, NameOther, nil},
{`(atan2|cos|exp|int|log|rand|sin|sqrt|srand|gensub|gsub|index|length|match|split|patsplit|sprintf|sub|substr|tolower|toupper|close|fflush|getline|next(file)|print|printf|strftime|systime|mktime|delete|system|strtonum|and|compl|lshift|or|rshift|asorti?|isarray|bindtextdomain|dcn?gettext|@(include|load|namespace))\b`, KeywordReserved, nil},
{`(ARGC|ARGIND|ARGV|BEGIN(FILE)?|BINMODE|CONVFMT|ENVIRON|END(FILE)?|ERRNO|FIELDWIDTHS|FILENAME|FNR|FPAT|FS|IGNORECASE|LINT|NF|NR|OFMT|OFS|ORS|PROCINFO|RLENGTH|RS|RSTART|RT|SUBSEP|TEXTDOMAIN)\b`, NameBuiltin, nil},
{`[@$a-zA-Z_]\w*`, NameOther, nil},
{`[0-9][0-9]*\.[0-9]+([eE][0-9]+)?[fd]?`, LiteralNumberFloat, nil},
{`0x[0-9a-fA-F]+`, LiteralNumberHex, nil},
{`[0-9]+`, LiteralNumberInteger, nil},

@ -28,6 +28,13 @@ var Elixir = internal.Register(MustNewLexer(
{`:"`, LiteralStringSymbol, Push("string_double_atom")},
{`:'`, LiteralStringSymbol, Push("string_single_atom")},
{`((?:\.\.\.|<<>>|%\{\}|%|\{\})|(?:(?:\.\.\.|[a-z_]\w*[!?]?)|[A-Z]\w*(?:\.[A-Z]\w*)*|(?:\<\<\<|\>\>\>|\|\|\||\&\&\&|\^\^\^|\~\~\~|\=\=\=|\!\=\=|\~\>\>|\<\~\>|\|\~\>|\<\|\>|\=\=|\!\=|\<\=|\>\=|\&\&|\|\||\<\>|\+\+|\-\-|\|\>|\=\~|\-\>|\<\-|\||\.|\=|\~\>|\<\~|\<|\>|\+|\-|\*|\/|\!|\^|\&)))(:)(?=\s|\n)`, ByGroups(LiteralStringSymbol, Punctuation), nil},
{`(fn|do|end|after|else|rescue|catch)\b`, Keyword, nil},
{`(not|and|or|when|in)\b`, OperatorWord, nil},
{`(case|cond|for|if|unless|try|receive|raise|quote|unquote|unquote_splicing|throw|super|while)\b`, Keyword, nil},
{`(def|defp|defmodule|defprotocol|defmacro|defmacrop|defdelegate|defexception|defstruct|defimpl|defcallback)\b`, KeywordDeclaration, nil},
{`(import|require|use|alias)\b`, KeywordNamespace, nil},
{`(nil|true|false)\b`, NameConstant, nil},
{`(_|__MODULE__|__DIR__|__ENV__|__CALLER__)\b`, NamePseudo, nil},
{`@(?:\.\.\.|[a-z_]\w*[!?]?)`, NameAttribute, nil},
{`(?:\.\.\.|[a-z_]\w*[!?]?)`, Name, nil},
{`(%?)([A-Z]\w*(?:\.[A-Z]\w*)*)`, ByGroups(Punctuation, NameClass), nil},

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -24,32 +24,71 @@ var Kotlin = internal.Register(MustNewLexer(
{`//[^\n]*\n?`, CommentSingle, nil},
{`/[*].*?[*]/`, CommentMultiline, nil},
{`\n`, Text, nil},
{`::|!!|\?[:.]`, Operator, nil},
{`[~!%^&*()+=|\[\]:;,.<>/?-]`, Punctuation, nil},
{`!==|!in|!is|===`, Operator, nil},
{`%=|&&|\*=|\+\+|\+=|--|-=|->|\.\.|\/=|::|<=|==|>=|!!|!=|\|\||\?[:.]`, Operator, nil},
{`[~!%^&*()+=|\[\]:;,.<>\/?-]`, Punctuation, nil},
{`[{}]`, Punctuation, nil},
{`"""[^"]*"""`, LiteralString, nil},
{`"(\\\\|\\"|[^"\n])*["\n]`, LiteralString, nil},
{`"""`, LiteralString, Push("rawstring")},
{`"`, LiteralStringDouble, Push("string")},
{`(')(\\u[0-9a-fA-F]{4})(')`, ByGroups(LiteralStringChar, LiteralStringEscape, LiteralStringChar), nil},
{`'\\.'|'[^\\]'`, LiteralStringChar, nil},
{`0[xX][0-9a-fA-F]+[Uu]?[Ll]?|[0-9]+(\.[0-9]*)?([eE][+-][0-9]+)?[fF]?[Uu]?[Ll]?`, LiteralNumber, nil},
{`(companion)(\s+)(object)`, ByGroups(Keyword, Text, Keyword), nil},
{`(class|interface|object)(\s+)`, ByGroups(Keyword, Text), Push("class")},
{`(package|import)(\s+)`, ByGroups(Keyword, Text), Push("package")},
{`(val|var)(\s+)`, ByGroups(Keyword, Text), Push("property")},
{`(fun)(\s+)(<[^>]*>\s+)?`, ByGroups(Keyword, Text, Text), Push("function")},
{`(abstract|actual|annotation|as|break|by|catch|class|companion|const|constructor|continue|crossinline|data|do|dynamic|else|enum|expect|external|false|final|finally|for|fun|get|if|import|in|infix|inline|inner|interface|internal|is|lateinit|noinline|null|object|open|operator|out|override|package|private|protected|public|reified|return|sealed|set|super|suspend|tailrec|this|throw|true|try|val|var|vararg|when|where|while)\b`, Keyword, nil},
{"(@?[" + kotlinIdentifier + "]*`)", Name, nil},
{`(fun)(\s+)`, ByGroups(Keyword, Text), Push("function")},
{`(abstract|actual|annotation|as|as\?|break|by|catch|class|companion|const|constructor|continue|crossinline|data|delegate|do|dynamic|else|enum|expect|external|false|field|file|final|finally|for|fun|get|if|import|in|infix|init|inline|inner|interface|internal|is|it|lateinit|noinline|null|object|open|operator|out|override|package|param|private|property|protected|public|receiver|reified|return|sealed|set|setparam|super|suspend|tailrec|this|throw|true|try|typealias|typeof|val|var|vararg|when|where|while)\b`, Keyword, nil},
{`@[` + kotlinIdentifier + `]+`, NameDecorator, nil},
{`[` + kotlinIdentifier + `]+`, Name, nil},
},
"package": {
{`\S+`, NameNamespace, Pop(1)},
},
"class": {
{"(@?[" + kotlinIdentifier + "]*`)", NameClass, Pop(1)},
// \x60 is the back tick character (`)
{`\x60[^\x60]+?\x60`, NameClass, Pop(1)},
{`[` + kotlinIdentifier + `]+`, NameClass, Pop(1)},
},
"property": {
{"(@?[" + kotlinIdentifier + " ]*`)", NameProperty, Pop(1)},
{`\x60[^\x60]+?\x60`, NameProperty, Pop(1)},
{`[` + kotlinIdentifier + `]+`, NameProperty, Pop(1)},
},
"generics-specification": {
{`<`, Punctuation, Push("generics-specification")}, // required for generics inside generics e.g. <T : List<Int> >
{`>`, Punctuation, Pop(1)},
{`[,:*?]`, Punctuation, nil},
{`(in|out|reified)`, Keyword, nil},
{`\x60[^\x60]+?\x60`, NameClass, nil},
{`[` + kotlinIdentifier + `]+`, NameClass, nil},
{`\s+`, Text, nil},
},
"function": {
{"(@?[" + kotlinIdentifier + " ]*`)", NameFunction, Pop(1)},
{`<`, Punctuation, Push("generics-specification")},
{`\x60[^\x60]+?\x60`, NameFunction, Pop(1)},
{`[` + kotlinIdentifier + `]+`, NameFunction, Pop(1)},
{`\s+`, Text, nil},
},
"rawstring": {
// raw strings don't allow character escaping
{`"""`, LiteralString, Pop(1)},
{`(?:[^$"]+|\"{1,2}[^"])+`, LiteralString, nil},
Include("string-interpol"),
// remaining dollar signs are just a string
{`\$`, LiteralString, nil},
},
"string": {
{`\\[tbnr'"\\\$]`, LiteralStringEscape, nil},
{`\\u[0-9a-fA-F]{4}`, LiteralStringEscape, nil},
{`"`, LiteralStringDouble, Pop(1)},
Include("string-interpol"),
{`[^\n\\"$]+`, LiteralStringDouble, nil},
// remaining dollar signs are just a string
{`\$`, LiteralStringDouble, nil},
},
"string-interpol": {
{`\$[` + kotlinIdentifier + `]+`, LiteralStringInterpol, nil},
{`\${[^}\n]*}`, LiteralStringInterpol, nil},
},
},
))

@ -23,13 +23,13 @@ var YAML = internal.Register(MustNewLexer(
{`&[^\s]+`, CommentPreproc, nil},
{`\*[^\s]+`, CommentPreproc, nil},
{`^%include\s+[^\n\r]+`, CommentPreproc, nil},
{`[>|](?:[+-])?\s(?:^(?:[ \n]{1})+.*\n?)*$`, StringDoc, nil},
Include("key"),
Include("value"),
{`[?:,\[\]]`, Punctuation, nil},
{`.`, Text, nil},
},
"value": {
{`([>|](?:[+-])?)(\n(^ {1,})(?:.*\n*(?:^\3 *).*)*)`, ByGroups(Punctuation, StringDoc, Whitespace), nil},
{Words(``, `\b`, "true", "True", "TRUE", "false", "False", "FALSE", "null",
"y", "Y", "yes", "Yes", "YES", "n", "N", "no", "No", "NO",
"on", "On", "ON", "off", "Off", "OFF"), KeywordConstant, nil},
@ -37,6 +37,7 @@ var YAML = internal.Register(MustNewLexer(
{`'(?:\\.|[^'])*'`, StringSingle, nil},
{`\d\d\d\d-\d\d-\d\d([T ]\d\d:\d\d:\d\d(\.\d+)?(Z|\s+[-+]\d+)?)?`, LiteralDate, nil},
{`\b[+\-]?(0x[\da-f]+|0o[0-7]+|(\d+\.?\d*|\.?\d+)(e[\+\-]?\d+)?|\.inf|\.nan)\b`, Number, nil},
{`([^\{\}\[\]\?,\:\!\-\*&\@].*)( )+(#.*)`, ByGroups(Literal, Whitespace, Comment), nil},
{`[^\{\}\[\]\?,\:\!\-\*&\@].*`, Literal, nil},
},
"key": {

@ -3,3 +3,5 @@ It was translated from the reference implementation (https://github.com/google/b
with the `c2go` tool at https://github.com/andybalholm/c2go.
I am using it in production with https://github.com/andybalholm/redwood.
API documentation is found at https://pkg.go.dev/github.com/andybalholm/brotli?tab=doc.

@ -1,5 +1,9 @@
package brotli
import (
"sync"
)
/* Copyright 2013 Google Inc. All Rights Reserved.
Distributed under MIT license.
@ -31,13 +35,10 @@ func computeDistanceCode(distance uint, max_distance uint, dist_cache []int) uin
return distance + numDistanceShortCodes - 1
}
/* "commands" points to the next output command to write to, "*num_commands" is
initially the total amount of commands output by previous
CreateBackwardReferences calls, and must be incremented by the amount written
by this call. */
func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands []command, num_commands *uint, num_literals *uint) {
var hasherSearchResultPool sync.Pool
func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands *[]command, num_literals *uint) {
var max_backward_limit uint = maxBackwardLimit(params.lgwin)
var orig_commands []command = commands
var insert_length uint = *last_insert_len
var pos_end uint = position + num_bytes
var store_end uint
@ -57,8 +58,14 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
/* Minimum score to accept a backward reference. */
hasher.PrepareDistanceCache(dist_cache)
var sr2 hasherSearchResult
var sr hasherSearchResult
sr2, _ := hasherSearchResultPool.Get().(*hasherSearchResult)
if sr2 == nil {
sr2 = &hasherSearchResult{}
}
sr, _ := hasherSearchResultPool.Get().(*hasherSearchResult)
if sr == nil {
sr = &hasherSearchResult{}
}
for position+hasher.HashTypeLength() < pos_end {
var max_length uint = pos_end - position
@ -67,7 +74,7 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
sr.len_code_delta = 0
sr.distance = 0
sr.score = kMinScore
hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position, max_length, max_distance, gap, params.dist.max_distance, &sr)
hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position, max_length, max_distance, gap, params.dist.max_distance, sr)
if sr.score > kMinScore {
/* Found a match. Let's look for something even better ahead. */
var delayed_backward_references_in_row int = 0
@ -83,14 +90,14 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
sr2.distance = 0
sr2.score = kMinScore
max_distance = brotli_min_size_t(position+1, max_backward_limit)
hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position+1, max_length, max_distance, gap, params.dist.max_distance, &sr2)
hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position+1, max_length, max_distance, gap, params.dist.max_distance, sr2)
if sr2.score >= sr.score+cost_diff_lazy {
/* Ok, let's just write one byte for now and start a match from the
next byte. */
position++
insert_length++
sr = sr2
*sr = *sr2
delayed_backward_references_in_row++
if delayed_backward_references_in_row < 4 && position+hasher.HashTypeLength() < pos_end {
continue
@ -114,8 +121,7 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
hasher.PrepareDistanceCache(dist_cache)
}
initCommand(&commands[0], &params.dist, insert_length, sr.len, sr.len_code_delta, distance_code)
commands = commands[1:]
*commands = append(*commands, makeCommand(&params.dist, insert_length, sr.len, sr.len_code_delta, distance_code))
}
*num_literals += insert_length
@ -173,5 +179,7 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
insert_length += pos_end - position
*last_insert_len = insert_length
*num_commands += uint(-cap(commands) + cap(orig_commands))
hasherSearchResultPool.Put(sr)
hasherSearchResultPool.Put(sr2)
}

@ -123,14 +123,13 @@ func setCost(histogram []uint32, histogram_size uint, literal_histogram bool, co
}
}
func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbuffer []byte, ringbuffer_mask uint, commands []command, num_commands uint, last_insert_len uint) {
func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbuffer []byte, ringbuffer_mask uint, commands []command, last_insert_len uint) {
var histogram_literal [numLiteralSymbols]uint32
var histogram_cmd [numCommandSymbols]uint32
var histogram_dist [maxEffectiveDistanceAlphabetSize]uint32
var cost_literal [numLiteralSymbols]float32
var pos uint = position - last_insert_len
var min_cost_cmd float32 = kInfinity
var i uint
var cost_cmd []float32 = self.cost_cmd_[:]
var literal_costs []float32
@ -138,7 +137,7 @@ func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbu
histogram_cmd = [numCommandSymbols]uint32{}
histogram_dist = [maxEffectiveDistanceAlphabetSize]uint32{}
for i = 0; i < num_commands; i++ {
for i := range commands {
var inslength uint = uint(commands[i].insert_len_)
var copylength uint = uint(commandCopyLen(&commands[i]))
var distcode uint = uint(commands[i].dist_prefix_) & 0x3FF
@ -161,7 +160,7 @@ func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbu
setCost(histogram_cmd[:], numCommandSymbols, false, cost_cmd)
setCost(histogram_dist[:], uint(self.distance_histogram_size), false, self.cost_dist_)
for i = 0; i < numCommandSymbols; i++ {
for i := 0; i < numCommandSymbols; i++ {
min_cost_cmd = brotli_min_float(min_cost_cmd, cost_cmd[i])
}
@ -169,10 +168,10 @@ func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbu
{
literal_costs = self.literal_costs_
var literal_carry float32 = 0.0
var num_bytes uint = self.num_bytes_
num_bytes := int(self.num_bytes_)
literal_costs[0] = 0.0
for i = 0; i < num_bytes; i++ {
literal_carry += cost_literal[ringbuffer[(position+i)&ringbuffer_mask]]
for i := 0; i < num_bytes; i++ {
literal_carry += cost_literal[ringbuffer[(position+uint(i))&ringbuffer_mask]]
literal_costs[i+1] = literal_costs[i] + literal_carry
literal_carry -= literal_costs[i+1] - literal_costs[i]
}
@ -502,7 +501,9 @@ func updateNodes(num_bytes uint, block_start uint, pos uint, ringbuffer []byte,
var cost float32 = dist_cost + float32(getCopyExtra(copycode)) + zopfliCostModelGetCommandCost(model, cmdcode)
if cost < nodes[pos+len].u.cost {
updateZopfliNode(nodes, pos, start, uint(len), len_code, dist, 0, cost)
result = brotli_max_size_t(result, uint(len))
if len > result {
result = len
}
}
}
}
@ -530,7 +531,7 @@ func computeShortestPathFromNodes(num_bytes uint, nodes []zopfliNode) uint {
}
/* REQUIRES: nodes != NULL and len(nodes) >= num_bytes + 1 */
func zopfliCreateCommands(num_bytes uint, block_start uint, nodes []zopfliNode, dist_cache []int, last_insert_len *uint, params *encoderParams, commands []command, num_literals *uint) {
func zopfliCreateCommands(num_bytes uint, block_start uint, nodes []zopfliNode, dist_cache []int, last_insert_len *uint, params *encoderParams, commands *[]command, num_literals *uint) {
var max_backward_limit uint = maxBackwardLimit(params.lgwin)
var pos uint = 0
var offset uint32 = nodes[0].u.next
@ -552,7 +553,7 @@ func zopfliCreateCommands(num_bytes uint, block_start uint, nodes []zopfliNode,
var max_distance uint = brotli_min_size_t(block_start+pos, max_backward_limit)
var is_dictionary bool = (distance > max_distance+gap)
var dist_code uint = uint(zopfliNodeDistanceCode(next))
initCommand(&commands[i], &params.dist, insert_length, copy_length, int(len_code)-int(copy_length), dist_code)
*commands = append(*commands, makeCommand(&params.dist, insert_length, copy_length, int(len_code)-int(copy_length), dist_code))
if !is_dictionary && dist_code > 0 {
dist_cache[3] = dist_cache[2]
@ -679,16 +680,16 @@ func zopfliComputeShortestPath(num_bytes uint, position uint, ringbuffer []byte,
return computeShortestPathFromNodes(num_bytes, nodes)
}
func createZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher *h10, dist_cache []int, last_insert_len *uint, commands []command, num_commands *uint, num_literals *uint) {
func createZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher *h10, dist_cache []int, last_insert_len *uint, commands *[]command, num_literals *uint) {
var nodes []zopfliNode
nodes = make([]zopfliNode, (num_bytes + 1))
initZopfliNodes(nodes, num_bytes+1)
*num_commands += zopfliComputeShortestPath(num_bytes, position, ringbuffer, ringbuffer_mask, params, dist_cache, hasher, nodes)
zopfliComputeShortestPath(num_bytes, position, ringbuffer, ringbuffer_mask, params, dist_cache, hasher, nodes)
zopfliCreateCommands(num_bytes, position, nodes, dist_cache, last_insert_len, params, commands, num_literals)
nodes = nil
}
func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands []command, num_commands *uint, num_literals *uint) {
func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands *[]command, num_literals *uint) {
var max_backward_limit uint = maxBackwardLimit(params.lgwin)
var num_matches []uint32 = make([]uint32, num_bytes)
var matches_size uint = 4 * num_bytes
@ -703,7 +704,7 @@ func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer
var orig_num_literals uint
var orig_last_insert_len uint
var orig_dist_cache [4]int
var orig_num_commands uint
var orig_num_commands int
var model zopfliCostModel
var nodes []zopfliNode
var matches []backwardMatch = make([]backwardMatch, matches_size)
@ -769,7 +770,7 @@ func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer
orig_num_literals = *num_literals
orig_last_insert_len = *last_insert_len
copy(orig_dist_cache[:], dist_cache[:4])
orig_num_commands = *num_commands
orig_num_commands = len(*commands)
nodes = make([]zopfliNode, (num_bytes + 1))
initZopfliCostModel(&model, &params.dist, num_bytes)
for i = 0; i < 2; i++ {
@ -777,14 +778,14 @@ func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer
if i == 0 {
zopfliCostModelSetFromLiteralCosts(&model, position, ringbuffer, ringbuffer_mask)
} else {
zopfliCostModelSetFromCommands(&model, position, ringbuffer, ringbuffer_mask, commands, *num_commands-orig_num_commands, orig_last_insert_len)
zopfliCostModelSetFromCommands(&model, position, ringbuffer, ringbuffer_mask, (*commands)[orig_num_commands:], orig_last_insert_len)
}
*num_commands = orig_num_commands
*commands = (*commands)[:orig_num_commands]
*num_literals = orig_num_literals
*last_insert_len = orig_last_insert_len
copy(dist_cache, orig_dist_cache[:4])
*num_commands += zopfliIterate(num_bytes, position, ringbuffer, ringbuffer_mask, params, gap, dist_cache, &model, num_matches, matches, nodes)
zopfliIterate(num_bytes, position, ringbuffer, ringbuffer_mask, params, gap, dist_cache, &model, num_matches, matches, nodes)
zopfliCreateCommands(num_bytes, position, nodes, dist_cache, last_insert_len, params, commands, num_literals)
}

@ -33,23 +33,21 @@ const (
kMinItersForRefining uint = 100
)
func countLiterals(cmds []command, num_commands uint) uint {
func countLiterals(cmds []command) uint {
var total_length uint = 0
/* Count how many we have. */
var i uint
for i = 0; i < num_commands; i++ {
for i := range cmds {
total_length += uint(cmds[i].insert_len_)
}
return total_length
}
func copyLiteralsToByteArray(cmds []command, num_commands uint, data []byte, offset uint, mask uint, literals []byte) {
func copyLiteralsToByteArray(cmds []command, data []byte, offset uint, mask uint, literals []byte) {
var pos uint = 0
var from_pos uint = offset & mask
var i uint
for i = 0; i < num_commands; i++ {
for i := range cmds {
var insert_len uint = uint(cmds[i].insert_len_)
if from_pos+insert_len > mask {
var head_size uint = mask + 1 - from_pos
@ -90,24 +88,19 @@ const clustersPerBatch = 16
func initBlockSplit(self *blockSplit) {
self.num_types = 0
self.num_blocks = 0
self.types = nil
self.lengths = nil
self.types = self.types[:0]
self.lengths = self.lengths[:0]
self.types_alloc_size = 0
self.lengths_alloc_size = 0
}
func destroyBlockSplit(self *blockSplit) {
self.types = nil
self.lengths = nil
}
func splitBlock(cmds []command, num_commands uint, data []byte, pos uint, mask uint, params *encoderParams, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit) {
func splitBlock(cmds []command, data []byte, pos uint, mask uint, params *encoderParams, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit) {
{
var literals_count uint = countLiterals(cmds, num_commands)
var literals_count uint = countLiterals(cmds)
var literals []byte = make([]byte, literals_count)
/* Create a continuous array of literals. */
copyLiteralsToByteArray(cmds, num_commands, data, pos, mask, literals)
copyLiteralsToByteArray(cmds, data, pos, mask, literals)
/* Create the block split on the array of literals.
Literal histograms have alphabet size 256. */
@ -116,28 +109,26 @@ func splitBlock(cmds []command, num_commands uint, data []byte, pos uint, mask u
literals = nil
}
{
var insert_and_copy_codes []uint16 = make([]uint16, num_commands)
var insert_and_copy_codes []uint16 = make([]uint16, len(cmds))
/* Compute prefix codes for commands. */
var i uint
for i = 0; i < num_commands; i++ {
for i := range cmds {
insert_and_copy_codes[i] = cmds[i].cmd_prefix_
}
/* Create the block split on the array of command prefixes. */
splitByteVectorCommand(insert_and_copy_codes, num_commands, kSymbolsPerCommandHistogram, kMaxCommandHistograms, kCommandStrideLength, kCommandBlockSwitchCost, params, insert_and_copy_split)
splitByteVectorCommand(insert_and_copy_codes, kSymbolsPerCommandHistogram, kMaxCommandHistograms, kCommandStrideLength, kCommandBlockSwitchCost, params, insert_and_copy_split)
/* TODO: reuse for distances? */
insert_and_copy_codes = nil
}
{
var distance_prefixes []uint16 = make([]uint16, num_commands)
var distance_prefixes []uint16 = make([]uint16, len(cmds))
var j uint = 0
/* Create a continuous array of distance prefixes. */
var i uint
for i = 0; i < num_commands; i++ {
for i := range cmds {
var cmd *command = &cmds[i]
if commandCopyLen(cmd) != 0 && cmd.cmd_prefix_ >= 128 {
distance_prefixes[j] = cmd.dist_prefix_ & 0x3FF

@ -372,7 +372,8 @@ func clusterBlocksCommand(data []uint16, length uint, num_blocks uint, block_ids
histogram_symbols = nil
}
func splitByteVectorCommand(data []uint16, length uint, literals_per_histogram uint, max_histograms uint, sampling_stride_length uint, block_switch_cost float64, params *encoderParams, split *blockSplit) {
func splitByteVectorCommand(data []uint16, literals_per_histogram uint, max_histograms uint, sampling_stride_length uint, block_switch_cost float64, params *encoderParams, split *blockSplit) {
length := uint(len(data))
var data_size uint = histogramDataSizeCommand()
var num_histograms uint = length/literals_per_histogram + 1
var histograms []histogramCommand

File diff suppressed because it is too large Load Diff

@ -194,26 +194,28 @@ type command struct {
}
/* distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1. */
func initCommand(self *command, dist *distanceParams, insertlen uint, copylen uint, copylen_code_delta int, distance_code uint) {
func makeCommand(dist *distanceParams, insertlen uint, copylen uint, copylen_code_delta int, distance_code uint) (cmd command) {
/* Don't rely on signed int representation, use honest casts. */
var delta uint32 = uint32(byte(int8(copylen_code_delta)))
self.insert_len_ = uint32(insertlen)
self.copy_len_ = uint32(uint32(copylen) | delta<<25)
cmd.insert_len_ = uint32(insertlen)
cmd.copy_len_ = uint32(uint32(copylen) | delta<<25)
/* The distance prefix and extra bits are stored in this Command as if
npostfix and ndirect were 0, they are only recomputed later after the
clustering if needed. */
prefixEncodeCopyDistance(distance_code, uint(dist.num_direct_distance_codes), uint(dist.distance_postfix_bits), &self.dist_prefix_, &self.dist_extra_)
prefixEncodeCopyDistance(distance_code, uint(dist.num_direct_distance_codes), uint(dist.distance_postfix_bits), &cmd.dist_prefix_, &cmd.dist_extra_)
getLengthCode(insertlen, uint(int(copylen)+copylen_code_delta), (cmd.dist_prefix_&0x3FF == 0), &cmd.cmd_prefix_)
getLengthCode(insertlen, uint(int(copylen)+copylen_code_delta), (self.dist_prefix_&0x3FF == 0), &self.cmd_prefix_)
return cmd
}
func initInsertCommand(self *command, insertlen uint) {
self.insert_len_ = uint32(insertlen)
self.copy_len_ = 4 << 25
self.dist_extra_ = 0
self.dist_prefix_ = numDistanceShortCodes
getLengthCode(insertlen, 4, false, &self.cmd_prefix_)
func makeInsertCommand(insertlen uint) (cmd command) {
cmd.insert_len_ = uint32(insertlen)
cmd.copy_len_ = 4 << 25
cmd.dist_extra_ = 0
cmd.dist_prefix_ = numDistanceShortCodes
getLengthCode(insertlen, 4, false, &cmd.cmd_prefix_)
return cmd
}
func commandRestoreDistanceCode(self *command, dist *distanceParams) uint32 {

@ -33,14 +33,8 @@ func hashBytesAtOffset5(v uint64, offset int, shift uint) uint32 {
}
func isMatch5(p1 []byte, p2 []byte) bool {
var i int
for i = 0; i < 5; i++ {
if p1[i] != p2[i] {
return false
}
}
return true
return binary.LittleEndian.Uint32(p1) == binary.LittleEndian.Uint32(p2) &&
p1[4] == p2[4]
}
/* Builds a literal prefix code into "depths" and "bits" based on the statistics
@ -51,7 +45,7 @@ func isMatch5(p1 []byte, p2 []byte) bool {
and thus have to assign a non-zero depth for each literal.
Returns estimated compression ratio millibytes/char for encoding given input
with generated code. */
func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte, bits []uint16, storage_ix *uint, storage []byte) uint {
func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte, bits []uint16, bw *bitWriter) uint {
var histogram = [256]uint32{0}
var histogram_total uint
var i uint
@ -88,7 +82,7 @@ func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte
}
buildAndStoreHuffmanTreeFast(histogram[:], histogram_total, /* max_bits = */
8, depths, bits, storage_ix, storage)
8, depths, bits, bw)
{
var literal_ratio uint = 0
for i = 0; i < 256; i++ {
@ -104,7 +98,7 @@ func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte
/* Builds a command and distance prefix code (each 64 symbols) into "depth" and
"bits" based on "histogram" and stores it into the bit stream. */
func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []uint16, bw *bitWriter) {
var tree [129]huffmanTree
var cmd_depth = [numCommandSymbols]byte{0}
/* Tree size for building a tree over 64 symbols is 2 * 64 + 1. */
@ -151,141 +145,141 @@ func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []ui
cmd_depth[448+8*i] = depth[56+i]
}
storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], storage_ix, storage)
storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], bw)
}
storeHuffmanTree(depth[64:], 64, tree[:], storage_ix, storage)
storeHuffmanTree(depth[64:], 64, tree[:], bw)
}
/* REQUIRES: insertlen < 6210 */
func emitInsertLen1(insertlen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
func emitInsertLen1(insertlen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
if insertlen < 6 {
var code uint = insertlen + 40
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
histo[code]++
} else if insertlen < 130 {
var tail uint = insertlen - 2
var nbits uint32 = log2FloorNonZero(tail) - 1
var prefix uint = tail >> nbits
var inscode uint = uint((nbits << 1) + uint32(prefix) + 42)
writeBits(uint(depth[inscode]), uint64(bits[inscode]), storage_ix, storage)
writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits), storage_ix, storage)
bw.writeBits(uint(depth[inscode]), uint64(bits[inscode]))
bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits))
histo[inscode]++
} else if insertlen < 2114 {
var tail uint = insertlen - 66
var nbits uint32 = log2FloorNonZero(tail)
var code uint = uint(nbits + 50)
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits))
histo[code]++
} else {
writeBits(uint(depth[61]), uint64(bits[61]), storage_ix, storage)
writeBits(12, uint64(insertlen)-2114, storage_ix, storage)
bw.writeBits(uint(depth[61]), uint64(bits[61]))
bw.writeBits(12, uint64(insertlen)-2114)
histo[61]++
}
}
func emitLongInsertLen(insertlen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
func emitLongInsertLen(insertlen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
if insertlen < 22594 {
writeBits(uint(depth[62]), uint64(bits[62]), storage_ix, storage)
writeBits(14, uint64(insertlen)-6210, storage_ix, storage)
bw.writeBits(uint(depth[62]), uint64(bits[62]))
bw.writeBits(14, uint64(insertlen)-6210)
histo[62]++
} else {
writeBits(uint(depth[63]), uint64(bits[63]), storage_ix, storage)
writeBits(24, uint64(insertlen)-22594, storage_ix, storage)
bw.writeBits(uint(depth[63]), uint64(bits[63]))
bw.writeBits(24, uint64(insertlen)-22594)
histo[63]++
}
}
func emitCopyLen1(copylen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
func emitCopyLen1(copylen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
if copylen < 10 {
writeBits(uint(depth[copylen+14]), uint64(bits[copylen+14]), storage_ix, storage)
bw.writeBits(uint(depth[copylen+14]), uint64(bits[copylen+14]))
histo[copylen+14]++
} else if copylen < 134 {
var tail uint = copylen - 6
var nbits uint32 = log2FloorNonZero(tail) - 1
var prefix uint = tail >> nbits
var code uint = uint((nbits << 1) + uint32(prefix) + 20)
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits))
histo[code]++
} else if copylen < 2118 {
var tail uint = copylen - 70
var nbits uint32 = log2FloorNonZero(tail)
var code uint = uint(nbits + 28)
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits))
histo[code]++
} else {
writeBits(uint(depth[39]), uint64(bits[39]), storage_ix, storage)
writeBits(24, uint64(copylen)-2118, storage_ix, storage)
bw.writeBits(uint(depth[39]), uint64(bits[39]))
bw.writeBits(24, uint64(copylen)-2118)
histo[39]++
}
}
func emitCopyLenLastDistance1(copylen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
func emitCopyLenLastDistance1(copylen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
if copylen < 12 {
writeBits(uint(depth[copylen-4]), uint64(bits[copylen-4]), storage_ix, storage)
bw.writeBits(uint(depth[copylen-4]), uint64(bits[copylen-4]))
histo[copylen-4]++
} else if copylen < 72 {
var tail uint = copylen - 8
var nbits uint32 = log2FloorNonZero(tail) - 1
var prefix uint = tail >> nbits
var code uint = uint((nbits << 1) + uint32(prefix) + 4)
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits))
histo[code]++
} else if copylen < 136 {
var tail uint = copylen - 8
var code uint = (tail >> 5) + 30
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
writeBits(5, uint64(tail)&31, storage_ix, storage)
writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
bw.writeBits(5, uint64(tail)&31)
bw.writeBits(uint(depth[64]), uint64(bits[64]))
histo[code]++
histo[64]++
} else if copylen < 2120 {
var tail uint = copylen - 72
var nbits uint32 = log2FloorNonZero(tail)
var code uint = uint(nbits + 28)
writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits), storage_ix, storage)
writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage)
bw.writeBits(uint(depth[code]), uint64(bits[code]))
bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits))
bw.writeBits(uint(depth[64]), uint64(bits[64]))
histo[code]++
histo[64]++
} else {
writeBits(uint(depth[39]), uint64(bits[39]), storage_ix, storage)
writeBits(24, uint64(copylen)-2120, storage_ix, storage)
writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage)
bw.writeBits(uint(depth[39]), uint64(bits[39]))
bw.writeBits(24, uint64(copylen)-2120)
bw.writeBits(uint(depth[64]), uint64(bits[64]))
histo[39]++
histo[64]++
}
}
func emitDistance1(distance uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
func emitDistance1(distance uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
var d uint = distance + 3
var nbits uint32 = log2FloorNonZero(d) - 1
var prefix uint = (d >> nbits) & 1
var offset uint = (2 + prefix) << nbits
var distcode uint = uint(2*(nbits-1) + uint32(prefix) + 80)
writeBits(uint(depth[distcode]), uint64(bits[distcode]), storage_ix, storage)
writeBits(uint(nbits), uint64(d)-uint64(offset), storage_ix, storage)
bw.writeBits(uint(depth[distcode]), uint64(bits[distcode]))
bw.writeBits(uint(nbits), uint64(d)-uint64(offset))
histo[distcode]++
}
func emitLiterals(input []byte, len uint, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
func emitLiterals(input []byte, len uint, depth []byte, bits []uint16, bw *bitWriter) {
var j uint
for j = 0; j < len; j++ {
var lit byte = input[j]
writeBits(uint(depth[lit]), uint64(bits[lit]), storage_ix, storage)
bw.writeBits(uint(depth[lit]), uint64(bits[lit]))
}
}
/* REQUIRES: len <= 1 << 24. */
func storeMetaBlockHeader1(len uint, is_uncompressed bool, storage_ix *uint, storage []byte) {
func storeMetaBlockHeader1(len uint, is_uncompressed bool, bw *bitWriter) {
var nibbles uint = 6
/* ISLAST */
writeBits(1, 0, storage_ix, storage)
bw.writeBits(1, 0)
if len <= 1<<16 {
nibbles = 4
@ -293,34 +287,11 @@ func storeMetaBlockHeader1(len uint, is_uncompressed bool, storage_ix *uint, sto
nibbles = 5
}
writeBits(2, uint64(nibbles)-4, storage_ix, storage)
writeBits(nibbles*4, uint64(len)-1, storage_ix, storage)
bw.writeBits(2, uint64(nibbles)-4)
bw.writeBits(nibbles*4, uint64(len)-1)
/* ISUNCOMPRESSED */
writeSingleBit(is_uncompressed, storage_ix, storage)
}
func updateBits(n_bits uint, bits uint32, pos uint, array []byte) {
for n_bits > 0 {
var byte_pos uint = pos >> 3
var n_unchanged_bits uint = pos & 7
var n_changed_bits uint = brotli_min_size_t(n_bits, 8-n_unchanged_bits)
var total_bits uint = n_unchanged_bits + n_changed_bits
var mask uint32 = (^((1 << total_bits) - 1)) | ((1 << n_unchanged_bits) - 1)
var unchanged_bits uint32 = uint32(array[byte_pos]) & mask
var changed_bits uint32 = bits & ((1 << n_changed_bits) - 1)
array[byte_pos] = byte(changed_bits<<n_unchanged_bits | unchanged_bits)
n_bits -= n_changed_bits
bits >>= n_changed_bits
pos += n_changed_bits
}
}
func rewindBitPosition1(new_storage_ix uint, storage_ix *uint, storage []byte) {
var bitpos uint = new_storage_ix & 7
var mask uint = (1 << bitpos) - 1
storage[new_storage_ix>>3] &= byte(mask)
*storage_ix = new_storage_ix
bw.writeSingleBit(is_uncompressed)
}
var shouldMergeBlock_kSampleRate uint = 43
@ -351,151 +322,26 @@ func shouldUseUncompressedMode(metablock_start []byte, next_emit []byte, insertl
}
}
func emitUncompressedMetaBlock1(begin []byte, end []byte, storage_ix_start uint, storage_ix *uint, storage []byte) {
var len uint = uint(-cap(end) + cap(begin))
rewindBitPosition1(storage_ix_start, storage_ix, storage)
storeMetaBlockHeader1(uint(len), true, storage_ix, storage)
*storage_ix = (*storage_ix + 7) &^ 7
copy(storage[*storage_ix>>3:], begin[:len])
*storage_ix += uint(len << 3)
storage[*storage_ix>>3] = 0
func emitUncompressedMetaBlock1(data []byte, storage_ix_start uint, bw *bitWriter) {
bw.rewind(storage_ix_start)
storeMetaBlockHeader1(uint(len(data)), true, bw)
bw.jumpToByteBoundary()
bw.writeBytes(data)
}
var kCmdHistoSeed = [128]uint32{
0,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
0,
0,
0,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
0,
0,
0,
0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0,
}
var compressFragmentFastImpl_kFirstBlockSize uint = 3 << 15
var compressFragmentFastImpl_kMergeBlockSize uint = 1 << 16
func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []int, table_bits uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, storage_ix *uint, storage []byte) {
func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []int, table_bits uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, bw *bitWriter) {
var cmd_histo [128]uint32
var ip_end int
var next_emit int = 0
@ -506,7 +352,7 @@ func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []
var metablock_start int = input
var block_size uint = brotli_min_size_t(input_size, compressFragmentFastImpl_kFirstBlockSize)
var total_block_size uint = block_size
var mlen_storage_ix uint = *storage_ix + 3
var mlen_storage_ix uint = bw.getPos() + 3
var lit_depth [256]byte
var lit_bits [256]uint16
var literal_ratio uint
@ -523,21 +369,21 @@ func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []
/* Save the bit position of the MLEN field of the meta-block header, so that
we can update it later if we decide to extend this meta-block. */
storeMetaBlockHeader1(block_size, false, storage_ix, storage)
storeMetaBlockHeader1(block_size, false, bw)
/* No block splits, no contexts. */
writeBits(13, 0, storage_ix, storage)
bw.writeBits(13, 0)
literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], storage_ix, storage)
literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], bw)
{
/* Store the pre-compressed command and distance prefix codes. */
var i uint
for i = 0; i+7 < *cmd_code_numbits; i += 8 {
writeBits(8, uint64(cmd_code[i>>3]), storage_ix, storage)
bw.writeBits(8, uint64(cmd_code[i>>3]))
}
}
writeBits(*cmd_code_numbits&7, uint64(cmd_code[*cmd_code_numbits>>3]), storage_ix, storage)
bw.writeBits(*cmd_code_numbits&7, uint64(cmd_code[*cmd_code_numbits>>3]))
/* Initialize the command and distance histograms. We will gather
statistics of command and distance codes during the processing
@ -636,27 +482,27 @@ emit_commands:
var insert uint = uint(base - next_emit)
ip += int(matched)
if insert < 6210 {
emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
} else if shouldUseUncompressedMode(in[metablock_start:], in[next_emit:], insert, literal_ratio) {
emitUncompressedMetaBlock1(in[metablock_start:], in[base:], mlen_storage_ix-3, storage_ix, storage)
emitUncompressedMetaBlock1(in[metablock_start:base], mlen_storage_ix-3, bw)
input_size -= uint(base - input)
input = base
next_emit = input
goto next_block
} else {
emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
}
emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage)
emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw)
if distance == last_distance {
writeBits(uint(cmd_depth[64]), uint64(cmd_bits[64]), storage_ix, storage)
bw.writeBits(uint(cmd_depth[64]), uint64(cmd_bits[64]))
cmd_histo[64]++
} else {
emitDistance1(uint(distance), cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitDistance1(uint(distance), cmd_depth, cmd_bits, cmd_histo[:], bw)
last_distance = distance
}
emitCopyLenLastDistance1(matched, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitCopyLenLastDistance1(matched, cmd_depth, cmd_bits, cmd_histo[:], bw)
next_emit = ip
if ip >= ip_limit {
@ -692,8 +538,8 @@ emit_commands:
}
ip += int(matched)
last_distance = int(base - candidate) /* > 0 */
emitCopyLen1(matched, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitDistance1(uint(last_distance), cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitCopyLen1(matched, cmd_depth, cmd_bits, cmd_histo[:], bw)
emitDistance1(uint(last_distance), cmd_depth, cmd_bits, cmd_histo[:], bw)
next_emit = ip
if ip >= ip_limit {
@ -739,7 +585,7 @@ emit_remainder:
nibbles. */
total_block_size += block_size
updateBits(20, uint32(total_block_size-1), mlen_storage_ix, storage)
bw.updateBits(20, uint32(total_block_size-1), mlen_storage_ix)
goto emit_commands
}
@ -747,13 +593,13 @@ emit_remainder:
if next_emit < ip_end {
var insert uint = uint(ip_end - next_emit)
if insert < 6210 {
emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage)
emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw)
} else if shouldUseUncompressedMode(in[metablock_start:], in[next_emit:], insert, literal_ratio) {
emitUncompressedMetaBlock1(in[metablock_start:], in[ip_end:], mlen_storage_ix-3, storage_ix, storage)
emitUncompressedMetaBlock1(in[metablock_start:ip_end], mlen_storage_ix-3, bw)
} else {
emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage)
emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw)
}
}
@ -769,30 +615,29 @@ next_block:
/* Save the bit position of the MLEN field of the meta-block header, so that
we can update it later if we decide to extend this meta-block. */
mlen_storage_ix = *storage_ix + 3
mlen_storage_ix = bw.getPos() + 3
storeMetaBlockHeader1(block_size, false, storage_ix, storage)
storeMetaBlockHeader1(block_size, false, bw)
/* No block splits, no contexts. */
writeBits(13, 0, storage_ix, storage)
bw.writeBits(13, 0)
literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], storage_ix, storage)
buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, storage_ix, storage)
literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], bw)
buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, bw)
goto emit_commands
}
if !is_last {
/* If this is not the last block, update the command and distance prefix
codes for the next block and store the compressed forms. */
cmd_code[0] = 0
*cmd_code_numbits = 0
buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, cmd_code_numbits, cmd_code)
var bw bitWriter
bw.dst = cmd_code
buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, &bw)
*cmd_code_numbits = bw.getPos()
}
}
/* Compresses "input" string to the "*storage" buffer as one or more complete
meta-blocks, and updates the "*storage_ix" bit position.
/* Compresses "input" string to bw as one or more complete meta-blocks.
If "is_last" is 1, emits an additional empty last meta-block.
@ -813,28 +658,28 @@ next_block:
REQUIRES: "table_size" is an odd (9, 11, 13, 15) power of two
OUTPUT: maximal copy distance <= |input_size|
OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */
func compressFragmentFast(input []byte, input_size uint, is_last bool, table []int, table_size uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, storage_ix *uint, storage []byte) {
var initial_storage_ix uint = *storage_ix
func compressFragmentFast(input []byte, input_size uint, is_last bool, table []int, table_size uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, bw *bitWriter) {
var initial_storage_ix uint = bw.getPos()
var table_bits uint = uint(log2FloorNonZero(table_size))
if input_size == 0 {
assert(is_last)
writeBits(1, 1, storage_ix, storage) /* islast */
writeBits(1, 1, storage_ix, storage) /* isempty */
*storage_ix = (*storage_ix + 7) &^ 7
bw.writeBits(1, 1) /* islast */
bw.writeBits(1, 1) /* isempty */
bw.jumpToByteBoundary()
return
}
compressFragmentFastImpl(input, input_size, is_last, table, table_bits, cmd_depth, cmd_bits, cmd_code_numbits, cmd_code, storage_ix, storage)
compressFragmentFastImpl(input, input_size, is_last, table, table_bits, cmd_depth, cmd_bits, cmd_code_numbits, cmd_code, bw)
/* If output is larger than single uncompressed block, rewrite it. */
if *storage_ix-initial_storage_ix > 31+(input_size<<3) {
emitUncompressedMetaBlock1(input, input[input_size:], initial_storage_ix, storage_ix, storage)
if bw.getPos()-initial_storage_ix > 31+(input_size<<3) {
emitUncompressedMetaBlock1(input[:input_size], initial_storage_ix, bw)
}
if is_last {
writeBits(1, 1, storage_ix, storage) /* islast */
writeBits(1, 1, storage_ix, storage) /* isempty */
*storage_ix = (*storage_ix + 7) &^ 7
bw.writeBits(1, 1) /* islast */
bw.writeBits(1, 1) /* isempty */
bw.jumpToByteBoundary()
}
}

@ -30,19 +30,18 @@ func hashBytesAtOffset(v uint64, offset uint, shift uint, length uint) uint32 {
}
func isMatch1(p1 []byte, p2 []byte, length uint) bool {
var i uint
for i = 0; i < length && i < 6; i++ {
if p1[i] != p2[i] {
if binary.LittleEndian.Uint32(p1) != binary.LittleEndian.Uint32(p2) {
return false
}
}
if length == 4 {
return true
}
return p1[4] == p2[4] && p1[5] == p2[5]
}
/* Builds a command and distance prefix code (each 64 symbols) into "depth" and
"bits" based on "histogram" and stores it into the bit stream. */
func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, bw *bitWriter) {
var tree [129]huffmanTree
var cmd_depth = [numCommandSymbols]byte{0}
/* Tree size for building a tree over 64 symbols is 2 * 64 + 1. */
@ -88,10 +87,10 @@ func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uin
cmd_depth[448+8*i] = depth[16+i]
}
storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], storage_ix, storage)
storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], bw)
}
storeHuffmanTree(depth[64:], 64, tree[:], storage_ix, storage)
storeHuffmanTree(depth[64:], 64, tree[:], bw)
}
func emitInsertLen(insertlen uint32, commands *[]uint32) {
@ -198,11 +197,11 @@ func emitDistance(distance uint32, commands *[]uint32) {
}
/* REQUIRES: len <= 1 << 24. */
func storeMetaBlockHeader(len uint, is_uncompressed bool, storage_ix *uint, storage []byte) {
func storeMetaBlockHeader(len uint, is_uncompressed bool, bw *bitWriter) {
var nibbles uint = 6
/* ISLAST */
writeBits(1, 0, storage_ix, storage)
bw.writeBits(1, 0)
if len <= 1<<16 {
nibbles = 4
@ -210,11 +209,11 @@ func storeMetaBlockHeader(len uint, is_uncompressed bool, storage_ix *uint, stor
nibbles = 5
}
writeBits(2, uint64(nibbles)-4, storage_ix, storage)
writeBits(nibbles*4, uint64(len)-1, storage_ix, storage)
bw.writeBits(2, uint64(nibbles)-4)
bw.writeBits(nibbles*4, uint64(len)-1)
/* ISUNCOMPRESSED */
writeSingleBit(is_uncompressed, storage_ix, storage)
bw.writeSingleBit(is_uncompressed)
}
func createCommands(input []byte, block_size uint, input_size uint, base_ip_ptr []byte, table []int, table_bits uint, min_match uint, literals *[]byte, commands *[]uint32) {
@ -441,163 +440,20 @@ emit_remainder:
}
var storeCommands_kNumExtraBits = [128]uint32{
0,
0,
0,
0,
0,
0,
1,
1,
2,
2,
3,
3,
4,
4,
5,
5,
6,
7,
8,
9,
10,
12,
14,
24,
0,
0,
0,
0,
0,
0,
0,
0,
1,
1,
2,
2,
3,
3,
4,
4,
0,
0,
0,
0,
0,
0,
0,
0,
1,
1,
2,
2,
3,
3,
4,
4,
5,
5,
6,
7,
8,
9,
10,
24,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
1,
2,
2,
3,
3,
4,
4,
5,
5,
6,
6,
7,
7,
8,
8,
9,
9,
10,
10,
11,
11,
12,
12,
13,
13,
14,
14,
15,
15,
16,
16,
17,
17,
18,
18,
19,
19,
20,
20,
21,
21,
22,
22,
23,
23,
24,
24,
0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 12, 14, 24,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 24,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24,
}
var storeCommands_kInsertOffset = [24]uint32{
0,
1,
2,
3,
4,
5,
6,
8,
10,
14,
18,
26,
34,
50,
66,
98,
130,
194,
322,
578,
1090,
2114,
6210,
22594,
}
func storeCommands(literals []byte, num_literals uint, commands []uint32, num_commands uint, storage_ix *uint, storage []byte) {
0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 18, 26, 34, 50, 66, 98, 130, 194, 322, 578,
1090, 2114, 6210, 22594,
}
func storeCommands(literals []byte, num_literals uint, commands []uint32, num_commands uint, bw *bitWriter) {
var lit_depths [256]byte
var lit_bits [256]uint16
var lit_histo = [256]uint32{0}
@ -610,7 +466,7 @@ func storeCommands(literals []byte, num_literals uint, commands []uint32, num_co
}
buildAndStoreHuffmanTreeFast(lit_histo[:], num_literals, /* max_bits = */
8, lit_depths[:], lit_bits[:], storage_ix, storage)
8, lit_depths[:], lit_bits[:], bw)
for i = 0; i < num_commands; i++ {
var code uint32 = commands[i] & 0xFF
@ -622,21 +478,21 @@ func storeCommands(literals []byte, num_literals uint, commands []uint32, num_co
cmd_histo[2] += 1
cmd_histo[64] += 1
cmd_histo[84] += 1
buildAndStoreCommandPrefixCode(cmd_histo[:], cmd_depths[:], cmd_bits[:], storage_ix, storage)
buildAndStoreCommandPrefixCode(cmd_histo[:], cmd_depths[:], cmd_bits[:], bw)
for i = 0; i < num_commands; i++ {
var cmd uint32 = commands[i]
var code uint32 = cmd & 0xFF
var extra uint32 = cmd >> 8
assert(code < 128)
writeBits(uint(cmd_depths[code]), uint64(cmd_bits[code]), storage_ix, storage)
writeBits(uint(storeCommands_kNumExtraBits[code]), uint64(extra), storage_ix, storage)
bw.writeBits(uint(cmd_depths[code]), uint64(cmd_bits[code]))
bw.writeBits(uint(storeCommands_kNumExtraBits[code]), uint64(extra))
if code < 24 {
var insert uint32 = storeCommands_kInsertOffset[code] + extra
var j uint32
for j = 0; j < insert; j++ {
var lit byte = literals[0]
writeBits(uint(lit_depths[lit]), uint64(lit_bits[lit]), storage_ix, storage)
bw.writeBits(uint(lit_depths[lit]), uint64(lit_bits[lit]))
literals = literals[1:]
}
}
@ -664,22 +520,13 @@ func shouldCompress(input []byte, input_size uint, num_literals uint) bool {
}
}
func rewindBitPosition(new_storage_ix uint, storage_ix *uint, storage []byte) {
var bitpos uint = new_storage_ix & 7
var mask uint = (1 << bitpos) - 1
storage[new_storage_ix>>3] &= byte(mask)
*storage_ix = new_storage_ix
}
func emitUncompressedMetaBlock(input []byte, input_size uint, storage_ix *uint, storage []byte) {
storeMetaBlockHeader(input_size, true, storage_ix, storage)
*storage_ix = (*storage_ix + 7) &^ 7
copy(storage[*storage_ix>>3:], input[:input_size])
*storage_ix += input_size << 3
storage[*storage_ix>>3] = 0
func emitUncompressedMetaBlock(input []byte, input_size uint, bw *bitWriter) {
storeMetaBlockHeader(input_size, true, bw)
bw.jumpToByteBoundary()
bw.writeBytes(input[:input_size])
}
func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_bits uint, min_match uint, storage_ix *uint, storage []byte) {
func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_bits uint, min_match uint, bw *bitWriter) {
/* Save the start of the first block for position and distance computations.
*/
var base_ip []byte = input
@ -693,17 +540,17 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
num_literals = uint(-cap(literals) + cap(literal_buf))
if shouldCompress(input, block_size, num_literals) {
var num_commands uint = uint(-cap(commands) + cap(command_buf))
storeMetaBlockHeader(block_size, false, storage_ix, storage)
storeMetaBlockHeader(block_size, false, bw)
/* No block splits, no contexts. */
writeBits(13, 0, storage_ix, storage)
bw.writeBits(13, 0)
storeCommands(literal_buf, num_literals, command_buf, num_commands, storage_ix, storage)
storeCommands(literal_buf, num_literals, command_buf, num_commands, bw)
} else {
/* Since we did not find many backward references and the entropy of
the data is close to 8 bits, we can simply emit an uncompressed block.
This makes compression speed of uncompressible data about 3x faster. */
emitUncompressedMetaBlock(input, block_size, storage_ix, storage)
emitUncompressedMetaBlock(input, block_size, bw)
}
input = input[block_size:]
@ -711,8 +558,7 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
}
}
/* Compresses "input" string to the "*storage" buffer as one or more complete
meta-blocks, and updates the "*storage_ix" bit position.
/* Compresses "input" string to bw as one or more complete meta-blocks.
If "is_last" is 1, emits an additional empty last meta-block.
@ -724,8 +570,8 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
REQUIRES: "table_size" is a power of two
OUTPUT: maximal copy distance <= |input_size|
OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */
func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, storage_ix *uint, storage []byte) {
var initial_storage_ix uint = *storage_ix
func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, bw *bitWriter) {
var initial_storage_ix uint = bw.getPos()
var table_bits uint = uint(log2FloorNonZero(table_size))
var min_match uint
if table_bits <= 15 {
@ -733,17 +579,17 @@ func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, comman
} else {
min_match = 6
}
compressFragmentTwoPassImpl(input, input_size, is_last, command_buf, literal_buf, table, table_bits, min_match, storage_ix, storage)
compressFragmentTwoPassImpl(input, input_size, is_last, command_buf, literal_buf, table, table_bits, min_match, bw)
/* If output is larger than single uncompressed block, rewrite it. */
if *storage_ix-initial_storage_ix > 31+(input_size<<3) {
rewindBitPosition(initial_storage_ix, storage_ix, storage)
emitUncompressedMetaBlock(input, input_size, storage_ix, storage)
if bw.getPos()-initial_storage_ix > 31+(input_size<<3) {
bw.rewind(initial_storage_ix)
emitUncompressedMetaBlock(input, input_size, bw)
}
if is_last {
writeBits(1, 1, storage_ix, storage) /* islast */
writeBits(1, 1, storage_ix, storage) /* isempty */
*storage_ix = (*storage_ix + 7) &^ 7
bw.writeBits(1, 1) /* islast */
bw.writeBits(1, 1) /* isempty */
bw.jumpToByteBoundary()
}
}

File diff suppressed because it is too large Load Diff

@ -24,7 +24,7 @@ func initHuffmanTree(self *huffmanTree, count uint32, left int16, right int16) {
}
/* Input size optimized Shell sort. */
type huffmanTreeComparator func(*huffmanTree, *huffmanTree) bool
type huffmanTreeComparator func(huffmanTree, huffmanTree) bool
var sortHuffmanTreeItems_gaps = []uint{132, 57, 23, 10, 4, 1}
@ -36,14 +36,13 @@ func sortHuffmanTreeItems(items []huffmanTree, n uint, comparator huffmanTreeCom
var tmp huffmanTree = items[i]
var k uint = i
var j uint = i - 1
for comparator(&tmp, &items[j]) {
for comparator(tmp, items[j]) {
items[k] = items[j]
k = j
tmp10 := j
j--
if tmp10 == 0 {
if j == 0 {
break
}
j--
}
items[k] = tmp
@ -63,7 +62,7 @@ func sortHuffmanTreeItems(items []huffmanTree, n uint, comparator huffmanTreeCom
for i = gap; i < n; i++ {
var j uint = i
var tmp huffmanTree = items[i]
for ; j >= gap && comparator(&tmp, &items[j-gap]); j -= gap {
for ; j >= gap && comparator(tmp, items[j-gap]); j -= gap {
items[j] = items[j-gap]
}
@ -105,7 +104,7 @@ func setDepth(p0 int, pool []huffmanTree, depth []byte, max_depth int) bool {
}
/* Sort the root nodes, least popular first. */
func sortHuffmanTree(v0 *huffmanTree, v1 *huffmanTree) bool {
func sortHuffmanTree(v0 huffmanTree, v1 huffmanTree) bool {
if v0.total_count_ != v1.total_count_ {
return v0.total_count_ < v1.total_count_
}

@ -778,8 +778,9 @@ var kStaticDistanceCodeDepth = [64]byte{
var kCodeLengthBits = [18]uint32{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 15, 31, 0, 11, 7}
func storeStaticCodeLengthCode(storage_ix *uint, storage []byte) {
writeBits(40, 0x0000FF55555554, storage_ix, storage)
func storeStaticCodeLengthCode(bw *bitWriter) {
bw.writeBits(32, 0x55555554)
bw.writeBits(8, 0xFF)
}
var kZeroRepsBits = [numCommandSymbols]uint64{
@ -4317,9 +4318,10 @@ var kStaticCommandCodeBits = [numCommandSymbols]uint16{
2047,
}
func storeStaticCommandHuffmanTree(storage_ix *uint, storage []byte) {
writeBits(56, 0x92624416307003, storage_ix, storage)
writeBits(3, 0x00000000, storage_ix, storage)
func storeStaticCommandHuffmanTree(bw *bitWriter) {
bw.writeBits(32, 0x16307003)
bw.writeBits(24, 0x926244)
bw.writeBits(3, 0x00000000)
}
var kStaticDistanceCodeBits = [64]uint16{
@ -4389,6 +4391,6 @@ var kStaticDistanceCodeBits = [64]uint16{
63,
}
func storeStaticDistanceHuffmanTree(storage_ix *uint, storage []byte) {
writeBits(28, 0x0369DC03, storage_ix, storage)
func storeStaticDistanceHuffmanTree(bw *bitWriter) {
bw.writeBits(28, 0x0369DC03)
}

@ -1,5 +1,11 @@
package brotli
import (
"encoding/binary"
"math/bits"
"runtime"
)
/* Copyright 2010 Google Inc. All Rights Reserved.
Distributed under MIT license.
@ -9,6 +15,29 @@ package brotli
/* Function to find maximal matching prefixes of strings. */
func findMatchLengthWithLimit(s1 []byte, s2 []byte, limit uint) uint {
var matched uint = 0
_, _ = s1[limit-1], s2[limit-1] // bounds check
switch runtime.GOARCH {
case "amd64":
// Compare 8 bytes at at time.
for matched+8 <= limit {
w1 := binary.LittleEndian.Uint64(s1[matched:])
w2 := binary.LittleEndian.Uint64(s2[matched:])
if w1 != w2 {
return matched + uint(bits.TrailingZeros64(w1^w2)>>3)
}
matched += 8
}
case "386":
// Compare 4 bytes at at time.
for matched+4 <= limit {
w1 := binary.LittleEndian.Uint32(s1[matched:])
w2 := binary.LittleEndian.Uint32(s2[matched:])
if w1 != w2 {
return matched + uint(bits.TrailingZeros32(w1^w2)>>3)
}
matched += 4
}
}
for matched < limit && s1[matched] == s2[matched] {
matched++
}

@ -1,5 +1,3 @@
module github.com/andybalholm/brotli
go 1.12
require github.com/golang/gddo v0.0.0-20190419222130-af0f2af80721

@ -1,2 +0,0 @@
github.com/golang/gddo v0.0.0-20190419222130-af0f2af80721 h1:KRMr9A3qfbVM7iV/WcLY/rL5LICqwMHLhwRXKu99fXw=
github.com/golang/gddo v0.0.0-20190419222130-af0f2af80721/go.mod h1:xEhNfoBDX1hzLm2Nf80qUvZ2sVwoMZ8d6IE2SrsQfh4=

@ -163,7 +163,7 @@ func initBlockSplitIterator(self *blockSplitIterator, split *blockSplit) {
self.split_ = split
self.idx_ = 0
self.type_ = 0
if split.lengths != nil {
if len(split.lengths) > 0 {
self.length_ = uint(split.lengths[0])
} else {
self.length_ = 0
@ -180,17 +180,16 @@ func blockSplitIteratorNext(self *blockSplitIterator) {
self.length_--
}
func buildHistogramsWithContext(cmds []command, num_commands uint, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit, ringbuffer []byte, start_pos uint, mask uint, prev_byte byte, prev_byte2 byte, context_modes []int, literal_histograms []histogramLiteral, insert_and_copy_histograms []histogramCommand, copy_dist_histograms []histogramDistance) {
func buildHistogramsWithContext(cmds []command, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit, ringbuffer []byte, start_pos uint, mask uint, prev_byte byte, prev_byte2 byte, context_modes []int, literal_histograms []histogramLiteral, insert_and_copy_histograms []histogramCommand, copy_dist_histograms []histogramDistance) {
var pos uint = start_pos
var literal_it blockSplitIterator
var insert_and_copy_it blockSplitIterator
var dist_it blockSplitIterator
var i uint
initBlockSplitIterator(&literal_it, literal_split)
initBlockSplitIterator(&insert_and_copy_it, insert_and_copy_split)
initBlockSplitIterator(&dist_it, dist_split)
for i = 0; i < num_commands; i++ {
for i := range cmds {
var cmd *command = &cmds[i]
var j uint
blockSplitIteratorNext(&insert_and_copy_it)

@ -0,0 +1,192 @@
package brotli
import (
"compress/gzip"
"io"
"net/http"
"strings"
)
// HTTPCompressor chooses a compression method (brotli, gzip, or none) based on
// the Accept-Encoding header, sets the Content-Encoding header, and returns a
// WriteCloser that implements that compression. The Close method must be called
// before the current HTTP handler returns.
//
// Due to https://github.com/golang/go/issues/31753, the response will not be
// compressed unless you set a Content-Type header before you call
// HTTPCompressor.
func HTTPCompressor(w http.ResponseWriter, r *http.Request) io.WriteCloser {
if w.Header().Get("Content-Type") == "" {
return nopCloser{w}
}
if w.Header().Get("Vary") == "" {
w.Header().Set("Vary", "Accept-Encoding")
}
encoding := negotiateContentEncoding(r, []string{"br", "gzip"})
switch encoding {
case "br":
w.Header().Set("Content-Encoding", "br")
return NewWriter(w)
case "gzip":
w.Header().Set("Content-Encoding", "gzip")
return gzip.NewWriter(w)
}
return nopCloser{w}
}
// negotiateContentEncoding returns the best offered content encoding for the
// request's Accept-Encoding header. If two offers match with equal weight and
// then the offer earlier in the list is preferred. If no offers are
// acceptable, then "" is returned.
func negotiateContentEncoding(r *http.Request, offers []string) string {
bestOffer := "identity"
bestQ := -1.0
specs := parseAccept(r.Header, "Accept-Encoding")
for _, offer := range offers {
for _, spec := range specs {
if spec.Q > bestQ &&
(spec.Value == "*" || spec.Value == offer) {
bestQ = spec.Q
bestOffer = offer
}
}
}
if bestQ == 0 {
bestOffer = ""
}
return bestOffer
}
// acceptSpec describes an Accept* header.
type acceptSpec struct {
Value string
Q float64
}
// parseAccept parses Accept* headers.
func parseAccept(header http.Header, key string) (specs []acceptSpec) {
loop:
for _, s := range header[key] {
for {
var spec acceptSpec
spec.Value, s = expectTokenSlash(s)
if spec.Value == "" {
continue loop
}
spec.Q = 1.0
s = skipSpace(s)
if strings.HasPrefix(s, ";") {
s = skipSpace(s[1:])
if !strings.HasPrefix(s, "q=") {
continue loop
}
spec.Q, s = expectQuality(s[2:])
if spec.Q < 0.0 {
continue loop
}
}
specs = append(specs, spec)
s = skipSpace(s)
if !strings.HasPrefix(s, ",") {
continue loop
}
s = skipSpace(s[1:])
}
}
return
}
func skipSpace(s string) (rest string) {
i := 0
for ; i < len(s); i++ {
if octetTypes[s[i]]&isSpace == 0 {
break
}
}
return s[i:]
}
func expectTokenSlash(s string) (token, rest string) {
i := 0
for ; i < len(s); i++ {
b := s[i]
if (octetTypes[b]&isToken == 0) && b != '/' {
break
}
}
return s[:i], s[i:]
}
func expectQuality(s string) (q float64, rest string) {
switch {
case len(s) == 0:
return -1, ""
case s[0] == '0':
q = 0
case s[0] == '1':
q = 1
default:
return -1, ""
}
s = s[1:]
if !strings.HasPrefix(s, ".") {
return q, s
}
s = s[1:]
i := 0
n := 0
d := 1
for ; i < len(s); i++ {
b := s[i]
if b < '0' || b > '9' {
break
}
n = n*10 + int(b) - '0'
d *= 10
}
return q + float64(n)/float64(d), s[i:]
}
// Octet types from RFC 2616.
var octetTypes [256]octetType
type octetType byte
const (
isToken octetType = 1 << iota
isSpace
)
func init() {
// OCTET = <any 8-bit sequence of data>
// CHAR = <any US-ASCII character (octets 0 - 127)>
// CTL = <any US-ASCII control character (octets 0 - 31) and DEL (127)>
// CR = <US-ASCII CR, carriage return (13)>
// LF = <US-ASCII LF, linefeed (10)>
// SP = <US-ASCII SP, space (32)>
// HT = <US-ASCII HT, horizontal-tab (9)>
// <"> = <US-ASCII double-quote mark (34)>
// CRLF = CR LF
// LWS = [CRLF] 1*( SP | HT )
// TEXT = <any OCTET except CTLs, but including LWS>
// separators = "(" | ")" | "<" | ">" | "@" | "," | ";" | ":" | "\" | <">
// | "/" | "[" | "]" | "?" | "=" | "{" | "}" | SP | HT
// token = 1*<any CHAR except CTLs or separators>
// qdtext = <any TEXT except <">>
for c := 0; c < 256; c++ {
var t octetType
isCtl := c <= 31 || c == 127
isChar := 0 <= c && c <= 127
isSeparator := strings.IndexRune(" \t\"(),/:;<=>?@[]\\{}", rune(c)) >= 0
if strings.IndexRune(" \t\r\n", rune(c)) >= 0 {
t |= isSpace
}
if isChar && !isCtl && !isSeparator {
t |= isToken
}
octetTypes[c] = t
}
}

@ -23,12 +23,18 @@ func brotli_ensure_capacity_uint8_t(a *[]byte, c *uint, r uint) {
for new_size < r {
new_size *= 2
}
if cap(*a) < int(new_size) {
var new_array []byte = make([]byte, new_size)
if *c != 0 {
copy(new_array, (*a)[:*c])
}
*a = new_array
} else {
*a = (*a)[:new_size]
}
*c = new_size
}
}
@ -45,12 +51,16 @@ func brotli_ensure_capacity_uint32_t(a *[]uint32, c *uint, r uint) {
new_size *= 2
}
if cap(*a) < int(new_size) {
new_array = make([]uint32, new_size)
if *c != 0 {
copy(new_array, (*a)[:*c])
}
*a = new_array
} else {
*a = (*a)[:new_size]
}
*c = new_size
}
}

@ -1,5 +1,9 @@
package brotli
import (
"sync"
)
/* Copyright 2014 Google Inc. All Rights Reserved.
Distributed under MIT license.
@ -25,31 +29,30 @@ type metaBlockSplit struct {
distance_histograms_size uint
}
func initMetaBlockSplit(mb *metaBlockSplit) {
var metaBlockPool sync.Pool
func getMetaBlockSplit() *metaBlockSplit {
mb, _ := metaBlockPool.Get().(*metaBlockSplit)
if mb == nil {
mb = &metaBlockSplit{}
} else {
initBlockSplit(&mb.literal_split)
initBlockSplit(&mb.command_split)
initBlockSplit(&mb.distance_split)
mb.literal_context_map = nil
mb.literal_context_map = mb.literal_context_map[:0]
mb.literal_context_map_size = 0
mb.distance_context_map = nil
mb.distance_context_map = mb.distance_context_map[:0]
mb.distance_context_map_size = 0
mb.literal_histograms = nil
mb.literal_histograms_size = 0
mb.command_histograms = nil
mb.command_histograms_size = 0
mb.distance_histograms = nil
mb.distance_histograms_size = 0
mb.literal_histograms = mb.literal_histograms[:0]
mb.command_histograms = mb.command_histograms[:0]
mb.distance_histograms = mb.distance_histograms[:0]
}
return mb
}
func destroyMetaBlockSplit(mb *metaBlockSplit) {
destroyBlockSplit(&mb.literal_split)
destroyBlockSplit(&mb.command_split)
destroyBlockSplit(&mb.distance_split)
mb.literal_context_map = nil
mb.distance_context_map = nil
mb.literal_histograms = nil
mb.command_histograms = nil
mb.distance_histograms = nil
func freeMetaBlockSplit(mb *metaBlockSplit) {
metaBlockPool.Put(mb)
}
func initDistanceParams(params *encoderParams, npostfix uint32, ndirect uint32) {
@ -84,14 +87,12 @@ func initDistanceParams(params *encoderParams, npostfix uint32, ndirect uint32)
dist_params.max_distance = uint(max_distance)
}
func recomputeDistancePrefixes(cmds []command, num_commands uint, orig_params *distanceParams, new_params *distanceParams) {
var i uint
func recomputeDistancePrefixes(cmds []command, orig_params *distanceParams, new_params *distanceParams) {
if orig_params.distance_postfix_bits == new_params.distance_postfix_bits && orig_params.num_direct_distance_codes == new_params.num_direct_distance_codes {
return
}
for i = 0; i < num_commands; i++ {
for i := range cmds {
var cmd *command = &cmds[i]
if commandCopyLen(cmd) != 0 && cmd.cmd_prefix_ >= 128 {
prefixEncodeCopyDistance(uint(commandRestoreDistanceCode(cmd, orig_params)), uint(new_params.num_direct_distance_codes), uint(new_params.distance_postfix_bits), &cmd.dist_prefix_, &cmd.dist_extra_)
@ -99,8 +100,7 @@ func recomputeDistancePrefixes(cmds []command, num_commands uint, orig_params *d
}
}
func computeDistanceCost(cmds []command, num_commands uint, orig_params *distanceParams, new_params *distanceParams, cost *float64) bool {
var i uint
func computeDistanceCost(cmds []command, orig_params *distanceParams, new_params *distanceParams, cost *float64) bool {
var equal_params bool = false
var dist_prefix uint16
var dist_extra uint32
@ -112,8 +112,8 @@ func computeDistanceCost(cmds []command, num_commands uint, orig_params *distanc
equal_params = true
}
for i = 0; i < num_commands; i++ {
var cmd *command = &cmds[i]
for i := range cmds {
cmd := &cmds[i]
if commandCopyLen(cmd) != 0 && cmd.cmd_prefix_ >= 128 {
if equal_params {
dist_prefix = cmd.dist_prefix_
@ -137,7 +137,7 @@ func computeDistanceCost(cmds []command, num_commands uint, orig_params *distanc
var buildMetaBlock_kMaxNumberOfHistograms uint = 256
func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParams, prev_byte byte, prev_byte2 byte, cmds []command, num_commands uint, literal_context_mode int, mb *metaBlockSplit) {
func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParams, prev_byte byte, prev_byte2 byte, cmds []command, literal_context_mode int, mb *metaBlockSplit) {
var distance_histograms []histogramDistance
var literal_histograms []histogramLiteral
var literal_context_modes []int = nil
@ -164,7 +164,7 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
check_orig = false
}
skip = !computeDistanceCost(cmds, num_commands, &orig_params.dist, &new_params.dist, &dist_cost)
skip = !computeDistanceCost(cmds, &orig_params.dist, &new_params.dist, &dist_cost)
if skip || (dist_cost > best_dist_cost) {
break
}
@ -181,7 +181,7 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
if check_orig {
var dist_cost float64
computeDistanceCost(cmds, num_commands, &orig_params.dist, &orig_params.dist, &dist_cost)
computeDistanceCost(cmds, &orig_params.dist, &orig_params.dist, &dist_cost)
if dist_cost < best_dist_cost {
/* NB: currently unused; uncomment when more param tuning is added. */
/* best_dist_cost = dist_cost; */
@ -189,9 +189,9 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
}
}
recomputeDistancePrefixes(cmds, num_commands, &orig_params.dist, &params.dist)
recomputeDistancePrefixes(cmds, &orig_params.dist, &params.dist)
splitBlock(cmds, num_commands, ringbuffer, pos, mask, params, &mb.literal_split, &mb.command_split, &mb.distance_split)
splitBlock(cmds, ringbuffer, pos, mask, params, &mb.literal_split, &mb.command_split, &mb.distance_split)
if !params.disable_literal_context_modeling {
literal_context_multiplier = 1 << literalContextBits
@ -209,21 +209,30 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
distance_histograms = make([]histogramDistance, distance_histograms_size)
clearHistogramsDistance(distance_histograms, distance_histograms_size)
assert(mb.command_histograms == nil)
mb.command_histograms_size = mb.command_split.num_types
if cap(mb.command_histograms) < int(mb.command_histograms_size) {
mb.command_histograms = make([]histogramCommand, (mb.command_histograms_size))
} else {
mb.command_histograms = mb.command_histograms[:mb.command_histograms_size]
}
clearHistogramsCommand(mb.command_histograms, mb.command_histograms_size)
buildHistogramsWithContext(cmds, num_commands, &mb.literal_split, &mb.command_split, &mb.distance_split, ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_modes, literal_histograms, mb.command_histograms, distance_histograms)
buildHistogramsWithContext(cmds, &mb.literal_split, &mb.command_split, &mb.distance_split, ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_modes, literal_histograms, mb.command_histograms, distance_histograms)
literal_context_modes = nil
assert(mb.literal_context_map == nil)
mb.literal_context_map_size = mb.literal_split.num_types << literalContextBits
if cap(mb.literal_context_map) < int(mb.literal_context_map_size) {
mb.literal_context_map = make([]uint32, (mb.literal_context_map_size))
} else {
mb.literal_context_map = mb.literal_context_map[:mb.literal_context_map_size]
}
assert(mb.literal_histograms == nil)
mb.literal_histograms_size = mb.literal_context_map_size
if cap(mb.literal_histograms) < int(mb.literal_histograms_size) {
mb.literal_histograms = make([]histogramLiteral, (mb.literal_histograms_size))
} else {
mb.literal_histograms = mb.literal_histograms[:mb.literal_histograms_size]
}
clusterHistogramsLiteral(literal_histograms, literal_histograms_size, buildMetaBlock_kMaxNumberOfHistograms, mb.literal_histograms, &mb.literal_histograms_size, mb.literal_context_map)
literal_histograms = nil
@ -239,13 +248,19 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
}
}
assert(mb.distance_context_map == nil)
mb.distance_context_map_size = mb.distance_split.num_types << distanceContextBits
if cap(mb.distance_context_map) < int(mb.distance_context_map_size) {
mb.distance_context_map = make([]uint32, (mb.distance_context_map_size))
} else {
mb.distance_context_map = mb.distance_context_map[:mb.distance_context_map_size]
}
assert(mb.distance_histograms == nil)
mb.distance_histograms_size = mb.distance_context_map_size
if cap(mb.distance_histograms) < int(mb.distance_histograms_size) {
mb.distance_histograms = make([]histogramDistance, (mb.distance_histograms_size))
} else {
mb.distance_histograms = mb.distance_histograms[:mb.distance_histograms_size]
}
clusterHistogramsDistance(distance_histograms, mb.distance_context_map_size, buildMetaBlock_kMaxNumberOfHistograms, mb.distance_histograms, &mb.distance_histograms_size, mb.distance_context_map)
distance_histograms = nil
@ -298,9 +313,12 @@ func initContextBlockSplitter(self *contextBlockSplitter, alphabet_size uint, nu
brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
split.num_blocks = max_num_blocks
assert(*histograms == nil)
*histograms_size = max_num_types * num_contexts
if histograms == nil || cap(*histograms) < int(*histograms_size) {
*histograms = make([]histogramLiteral, (*histograms_size))
} else {
*histograms = (*histograms)[:*histograms_size]
}
self.histograms_ = *histograms
/* Clear only current histogram. */
@ -453,9 +471,12 @@ func contextBlockSplitterAddSymbol(self *contextBlockSplitter, symbol uint, cont
func mapStaticContexts(num_contexts uint, static_context_map []uint32, mb *metaBlockSplit) {
var i uint
assert(mb.literal_context_map == nil)
mb.literal_context_map_size = mb.literal_split.num_types << literalContextBits
if cap(mb.literal_context_map) < int(mb.literal_context_map_size) {
mb.literal_context_map = make([]uint32, (mb.literal_context_map_size))
} else {
mb.literal_context_map = mb.literal_context_map[:mb.literal_context_map_size]
}
for i = 0; i < mb.literal_split.num_types; i++ {
var offset uint32 = uint32(i * num_contexts)
@ -466,7 +487,7 @@ func mapStaticContexts(num_contexts uint, static_context_map []uint32, mb *metaB
}
}
func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, n_commands uint, mb *metaBlockSplit) {
func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, mb *metaBlockSplit) {
var lit_blocks struct {
plain blockSplitterLiteral
ctx contextBlockSplitter
@ -474,8 +495,7 @@ func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_b
var cmd_blocks blockSplitterCommand
var dist_blocks blockSplitterDistance
var num_literals uint = 0
var i uint
for i = 0; i < n_commands; i++ {
for i := range commands {
num_literals += uint(commands[i].insert_len_)
}
@ -485,11 +505,10 @@ func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_b
initContextBlockSplitter(&lit_blocks.ctx, 256, num_contexts, 512, 400.0, num_literals, &mb.literal_split, &mb.literal_histograms, &mb.literal_histograms_size)
}
initBlockSplitterCommand(&cmd_blocks, numCommandSymbols, 1024, 500.0, n_commands, &mb.command_split, &mb.command_histograms, &mb.command_histograms_size)
initBlockSplitterDistance(&dist_blocks, 64, 512, 100.0, n_commands, &mb.distance_split, &mb.distance_histograms, &mb.distance_histograms_size)
initBlockSplitterCommand(&cmd_blocks, numCommandSymbols, 1024, 500.0, uint(len(commands)), &mb.command_split, &mb.command_histograms, &mb.command_histograms_size)
initBlockSplitterDistance(&dist_blocks, 64, 512, 100.0, uint(len(commands)), &mb.distance_split, &mb.distance_histograms, &mb.distance_histograms_size)
for i = 0; i < n_commands; i++ {
var cmd command = commands[i]
for _, cmd := range commands {
var j uint
blockSplitterAddSymbolCommand(&cmd_blocks, uint(cmd.cmd_prefix_))
for j = uint(cmd.insert_len_); j != 0; j-- {
@ -530,11 +549,11 @@ func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_b
}
}
func buildMetaBlockGreedy(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, n_commands uint, mb *metaBlockSplit) {
func buildMetaBlockGreedy(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, mb *metaBlockSplit) {
if num_contexts == 1 {
buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, 1, nil, commands, n_commands, mb)
buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, 1, nil, commands, mb)
} else {
buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, num_contexts, static_context_map, commands, n_commands, mb)
buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, num_contexts, static_context_map, commands, mb)
}
}

@ -43,9 +43,12 @@ func initBlockSplitterCommand(self *blockSplitterCommand, alphabet_size uint, mi
brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
self.split_.num_blocks = max_num_blocks
assert(*histograms == nil)
*histograms_size = max_num_types
if histograms == nil || cap(*histograms) < int(*histograms_size) {
*histograms = make([]histogramCommand, (*histograms_size))
} else {
*histograms = (*histograms)[:*histograms_size]
}
self.histograms_ = *histograms
/* Clear only current histogram. */

@ -43,9 +43,12 @@ func initBlockSplitterDistance(self *blockSplitterDistance, alphabet_size uint,
brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
self.split_.num_blocks = max_num_blocks
assert(*histograms == nil)
*histograms_size = max_num_types
*histograms = make([]histogramDistance, (*histograms_size))
if histograms == nil || cap(*histograms) < int(*histograms_size) {
*histograms = make([]histogramDistance, *histograms_size)
} else {
*histograms = (*histograms)[:*histograms_size]
}
self.histograms_ = *histograms
/* Clear only current histogram. */

@ -43,9 +43,12 @@ func initBlockSplitterLiteral(self *blockSplitterLiteral, alphabet_size uint, mi
brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
self.split_.num_blocks = max_num_blocks
assert(*histograms == nil)
*histograms_size = max_num_types
*histograms = make([]histogramLiteral, (*histograms_size))
if histograms == nil || cap(*histograms) < int(*histograms_size) {
*histograms = make([]histogramLiteral, *histograms_size)
} else {
*histograms = (*histograms)[:*histograms_size]
}
self.histograms_ = *histograms
/* Clear only current histogram. */

@ -27,10 +27,7 @@ type ringBuffer struct {
}
func ringBufferInit(rb *ringBuffer) {
rb.cur_size_ = 0
rb.pos_ = 0
rb.data_ = nil
rb.buffer_ = nil
}
func ringBufferSetup(params *encoderParams, rb *ringBuffer) {
@ -47,11 +44,16 @@ const kSlackForEightByteHashingEverywhere uint = 7
/* Allocates or re-allocates data_ to the given length + plus some slack
region before and after. Fills the slack regions with zeros. */
func ringBufferInitBuffer(buflen uint32, rb *ringBuffer) {
var new_data []byte = make([]byte, (2 + uint(buflen) + kSlackForEightByteHashingEverywhere))
var new_data []byte
var i uint
size := 2 + int(buflen) + int(kSlackForEightByteHashingEverywhere)
if cap(rb.data_) < size {
new_data = make([]byte, size)
} else {
new_data = rb.data_[:size]
}
if rb.data_ != nil {
copy(new_data, rb.data_[:2+rb.cur_size_+uint32(kSlackForEightByteHashingEverywhere)])
rb.data_ = nil
}
rb.data_ = new_data

@ -8,49 +8,87 @@ package brotli
/* Write bits into a byte array. */
/* This function writes bits into bytes in increasing addresses, and within
a byte least-significant-bit first.
type bitWriter struct {
dst []byte
The function can write up to 56 bits in one go with WriteBits
Example: let's assume that 3 bits (Rs below) have been written already:
BYTE-0 BYTE+1 BYTE+2
0000 0RRR 0000 0000 0000 0000
// Data waiting to be written is the low nbits of bits.
bits uint64
nbits uint
}
Now, we could write 5 or less bits in MSB by just sifting by 3
and OR'ing to BYTE-0.
func (w *bitWriter) writeBits(nb uint, b uint64) {
w.bits |= b << w.nbits
w.nbits += nb
if w.nbits >= 32 {
bits := w.bits
w.bits >>= 32
w.nbits -= 32
w.dst = append(w.dst,
byte(bits),
byte(bits>>8),
byte(bits>>16),
byte(bits>>24),
)
}
}
For n bits, we take the last 5 bits, OR that with high bits in BYTE-0,
and locate the rest in BYTE+1, BYTE+2, etc. */
func writeBits(n_bits uint, bits uint64, pos *uint, array []byte) {
var array_pos []byte = array[*pos>>3:]
var bits_reserved_in_first_byte uint = (*pos & 7)
/* implicit & 0xFF is assumed for uint8_t arithmetics */
func (w *bitWriter) writeSingleBit(bit bool) {
if bit {
w.writeBits(1, 1)
} else {
w.writeBits(1, 0)
}
}
var bits_left_to_write uint
bits <<= bits_reserved_in_first_byte
array_pos[0] |= byte(bits)
array_pos = array_pos[1:]
for bits_left_to_write = n_bits + bits_reserved_in_first_byte; bits_left_to_write >= 9; bits_left_to_write -= 8 {
bits >>= 8
array_pos[0] = byte(bits)
array_pos = array_pos[1:]
func (w *bitWriter) jumpToByteBoundary() {
dst := w.dst
for w.nbits != 0 {
dst = append(dst, byte(w.bits))
w.bits >>= 8
if w.nbits > 8 { // Avoid underflow
w.nbits -= 8
} else {
w.nbits = 0
}
}
w.bits = 0
w.dst = dst
}
array_pos[0] = 0
*pos += n_bits
func (w *bitWriter) writeBytes(b []byte) {
if w.nbits&7 != 0 {
panic("writeBytes with unfinished bits")
}
for w.nbits != 0 {
w.dst = append(w.dst, byte(w.bits))
w.bits >>= 8
w.nbits -= 8
}
w.dst = append(w.dst, b...)
}
func writeSingleBit(bit bool, pos *uint, array []byte) {
if bit {
writeBits(1, 1, pos, array)
} else {
writeBits(1, 0, pos, array)
func (w *bitWriter) getPos() uint {
return uint(len(w.dst)<<3) + w.nbits
}
func (w *bitWriter) rewind(p uint) {
w.bits = uint64(w.dst[p>>3] & byte((1<<(p&7))-1))
w.nbits = p & 7
w.dst = w.dst[:p>>3]
}
func writeBitsPrepareStorage(pos uint, array []byte) {
assert(pos&7 == 0)
array[pos>>3] = 0
func (w *bitWriter) updateBits(n_bits uint, bits uint32, pos uint) {
for n_bits > 0 {
var byte_pos uint = pos >> 3
var n_unchanged_bits uint = pos & 7
var n_changed_bits uint = brotli_min_size_t(n_bits, 8-n_unchanged_bits)
var total_bits uint = n_unchanged_bits + n_changed_bits
var mask uint32 = (^((1 << total_bits) - 1)) | ((1 << n_unchanged_bits) - 1)
var unchanged_bits uint32 = uint32(w.dst[byte_pos]) & mask
var changed_bits uint32 = bits & ((1 << n_changed_bits) - 1)
w.dst[byte_pos] = byte(changed_bits<<n_unchanged_bits | unchanged_bits)
n_bits -= n_changed_bits
bits >>= n_changed_bits
pos += n_changed_bits
}
}

@ -1,12 +1,8 @@
package brotli
import (
"compress/gzip"
"errors"
"io"
"net/http"
"github.com/golang/gddo/httputil"
)
const (
@ -50,11 +46,8 @@ func NewWriterLevel(dst io.Writer, level int) *Writer {
// NewWriterOptions is like NewWriter but specifies WriterOptions
func NewWriterOptions(dst io.Writer, options WriterOptions) *Writer {
w := new(Writer)
w.options = options
w.Reset(dst)
w.params.quality = options.Quality
if options.LGWin > 0 {
w.params.lgwin = uint(options.LGWin)
}
return w
}
@ -63,6 +56,10 @@ func NewWriterOptions(dst io.Writer, options WriterOptions) *Writer {
// instead. This permits reusing a Writer rather than allocating a new one.
func (w *Writer) Reset(dst io.Writer) {
encoderInitState(w)
w.params.quality = w.options.Quality
if w.options.LGWin > 0 {
w.params.lgwin = uint(w.options.LGWin)
}
w.dst = dst
}
@ -70,6 +67,9 @@ func (w *Writer) writeChunk(p []byte, op int) (n int, err error) {
if w.dst == nil {
return 0, errWriterClosed
}
if w.err != nil {
return 0, w.err
}
for {
availableIn := uint(len(p))
@ -82,16 +82,8 @@ func (w *Writer) writeChunk(p []byte, op int) (n int, err error) {
return n, errEncode
}
outputData := encoderTakeOutput(w)
if len(outputData) > 0 {
_, err = w.dst.Write(outputData)
if err != nil {
return n, err
}
}
if len(p) == 0 {
return n, nil
if len(p) == 0 || w.err != nil {
return n, w.err
}
}
}
@ -124,32 +116,3 @@ type nopCloser struct {
}
func (nopCloser) Close() error { return nil }
// HTTPCompressor chooses a compression method (brotli, gzip, or none) based on
// the Accept-Encoding header, sets the Content-Encoding header, and returns a
// WriteCloser that implements that compression. The Close method must be called
// before the current HTTP handler returns.
//
// Due to https://github.com/golang/go/issues/31753, the response will not be
// compressed unless you set a Content-Type header before you call
// HTTPCompressor.
func HTTPCompressor(w http.ResponseWriter, r *http.Request) io.WriteCloser {
if w.Header().Get("Content-Type") == "" {
return nopCloser{w}
}
if w.Header().Get("Vary") == "" {
w.Header().Set("Vary", "Accept-Encoding")
}
encoding := httputil.NegotiateContentEncoding(r, []string{"br", "gzip"})
switch encoding {
case "br":
w.Header().Set("Content-Encoding", "br")
return NewWriter(w)
case "gzip":
w.Header().Set("Content-Encoding", "gzip")
return gzip.NewWriter(w)
}
return nopCloser{w}
}

@ -31,6 +31,13 @@ func main() {
}
}
```
output
cp
-Rdp
file name
file name2
dir name
## Documentation

@ -0,0 +1,3 @@
module github.com/anmitsu/go-shlex
go 1.13

@ -43,9 +43,16 @@ type configuration struct {
}
func (c *configuration) SetAnalysisQueueSize(n int) {
if c.analysisQueue != nil {
c.analysisQueue.Close()
}
c.analysisQueue = index.NewAnalysisQueue(n)
}
func (c *configuration) Shutdown() {
c.SetAnalysisQueueSize(0)
}
func newConfiguration() *configuration {
return &configuration{
Cache: registry.NewCache(),

@ -8,10 +8,11 @@ require (
github.com/blevesearch/go-porterstemmer v1.0.3
github.com/blevesearch/segment v0.9.0
github.com/blevesearch/snowballstem v0.9.0
github.com/blevesearch/zap/v11 v11.0.10
github.com/blevesearch/zap/v12 v12.0.10
github.com/blevesearch/zap/v13 v13.0.2
github.com/blevesearch/zap/v14 v14.0.1
github.com/blevesearch/zap/v11 v11.0.12
github.com/blevesearch/zap/v12 v12.0.12
github.com/blevesearch/zap/v13 v13.0.4
github.com/blevesearch/zap/v14 v14.0.3
github.com/blevesearch/zap/v15 v15.0.1
github.com/couchbase/moss v0.1.0
github.com/couchbase/vellum v1.0.2
github.com/golang/protobuf v1.3.2

@ -515,21 +515,17 @@ func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64,
return numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot
}
func (s *Scorch) rootDiskSegmentsPaths() map[string]struct{} {
rv := make(map[string]struct{}, len(s.root.segment))
for _, segmentSnapshot := range s.root.segment {
if seg, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok {
rv[seg.Path()] = struct{}{}
}
}
return rv
}
func (s *Scorch) StatsMap() map[string]interface{} {
m := s.stats.ToMap()
indexSnapshot := s.currentSnapshot()
defer func() {
_ = indexSnapshot.Close()
}()
rootSegPaths := indexSnapshot.diskSegmentsPaths()
s.rootLock.RLock()
rootSegPaths := s.rootDiskSegmentsPaths()
m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval))
s.rootLock.RUnlock()
@ -556,6 +552,10 @@ func (s *Scorch) StatsMap() map[string]interface{} {
m["num_bytes_used_disk"] = numBytesUsedDisk
// total disk bytes by the latest root index, exclusive of older snapshots
m["num_bytes_used_disk_by_root"] = numBytesOnDiskByRoot
// num_bytes_used_disk_by_root_reclaimable is an approximation about the
// reclaimable disk space in an index. (eg: from a full compaction)
m["num_bytes_used_disk_by_root_reclaimable"] = uint64(float64(numBytesOnDiskByRoot) *
indexSnapshot.reClaimableDocsRatio())
m["num_files_on_disk"] = numFilesOnDisk
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"]
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"]

@ -23,6 +23,7 @@ import (
zapv12 "github.com/blevesearch/zap/v12"
zapv13 "github.com/blevesearch/zap/v13"
zapv14 "github.com/blevesearch/zap/v14"
zapv15 "github.com/blevesearch/zap/v15"
)
var supportedSegmentPlugins map[string]map[uint32]segment.Plugin
@ -30,6 +31,7 @@ var defaultSegmentPlugin segment.Plugin
func init() {
ResetPlugins()
RegisterPlugin(zapv15.Plugin(), false)
RegisterPlugin(zapv14.Plugin(), false)
RegisterPlugin(zapv13.Plugin(), false)
RegisterPlugin(zapv12.Plugin(), false)

@ -708,6 +708,33 @@ func (i *IndexSnapshot) DumpFields() chan interface{} {
return rv
}
func (i *IndexSnapshot) diskSegmentsPaths() map[string]struct{} {
rv := make(map[string]struct{}, len(i.segment))
for _, segmentSnapshot := range i.segment {
if seg, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok {
rv[seg.Path()] = struct{}{}
}
}
return rv
}
// reClaimableDocsRatio gives a ratio about the obsoleted or
// reclaimable documents present in a given index snapshot.
func (i *IndexSnapshot) reClaimableDocsRatio() float64 {
var totalCount, liveCount uint64
for _, segmentSnapshot := range i.segment {
if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok {
totalCount += uint64(segmentSnapshot.FullSize())
liveCount += uint64(segmentSnapshot.Count())
}
}
if totalCount > 0 {
return float64(totalCount-liveCount) / float64(totalCount)
}
return 0
}
// subtractStrings returns set a minus elements of set b.
func subtractStrings(a, b []string) []string {
if len(b) == 0 {

@ -112,6 +112,11 @@ func newIndexUsing(path string, mapping mapping.IndexMapping, indexType string,
}
return nil, err
}
defer func(rv *indexImpl) {
if !rv.open {
rv.i.Close()
}
}(&rv)
// now persist the mapping
mappingBytes, err := json.Marshal(mapping)
@ -177,6 +182,11 @@ func openIndexUsing(path string, runtimeConfig map[string]interface{}) (rv *inde
}
return nil, err
}
defer func(rv *indexImpl) {
if !rv.open {
rv.i.Close()
}
}(rv)
// now load the mapping
indexReader, err := rv.i.Reader()

@ -15,6 +15,8 @@
package html
import (
"html"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search/highlight"
)
@ -54,18 +56,18 @@ func (a *FragmentFormatter) Format(f *highlight.Fragment, orderedTermLocations h
break
}
// add the stuff before this location
rv += string(f.Orig[curr:termLocation.Start])
// add the color
rv += html.EscapeString(string(f.Orig[curr:termLocation.Start]))
// start the <mark> tag
rv += a.before
// add the term itself
rv += string(f.Orig[termLocation.Start:termLocation.End])
// reset the color
// end the <mark> tag
rv += a.after
// update current
curr = termLocation.End
}
// add any remaining text after the last token
rv += string(f.Orig[curr:f.End])
rv += html.EscapeString(string(f.Orig[curr:f.End]))
return rv
}

@ -4,7 +4,7 @@ go 1.12
require (
github.com/RoaringBitmap/roaring v0.4.23
github.com/blevesearch/bleve v1.0.10
github.com/blevesearch/bleve v1.0.12
github.com/blevesearch/mmap-go v1.0.2
github.com/couchbase/vellum v1.0.2
github.com/golang/snappy v0.0.1

@ -4,7 +4,7 @@ go 1.12
require (
github.com/RoaringBitmap/roaring v0.4.23
github.com/blevesearch/bleve v1.0.10
github.com/blevesearch/bleve v1.0.12
github.com/blevesearch/mmap-go v1.0.2
github.com/couchbase/vellum v1.0.2
github.com/golang/snappy v0.0.1

@ -4,7 +4,7 @@ go 1.12
require (
github.com/RoaringBitmap/roaring v0.4.23
github.com/blevesearch/bleve v1.0.10
github.com/blevesearch/bleve v1.0.12
github.com/blevesearch/mmap-go v1.0.2
github.com/couchbase/vellum v1.0.2
github.com/golang/snappy v0.0.1

@ -4,7 +4,7 @@ go 1.12
require (
github.com/RoaringBitmap/roaring v0.4.23
github.com/blevesearch/bleve v1.0.10
github.com/blevesearch/bleve v1.0.12
github.com/blevesearch/mmap-go v1.0.2
github.com/couchbase/vellum v1.0.2
github.com/golang/snappy v0.0.1

@ -0,0 +1,12 @@
#*
*.sublime-*
*~
.#*
.project
.settings
**/.idea/
**/*.iml
.DS_Store
/cmd/zap/zap
*.test
tags

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -0,0 +1,158 @@
# zap file format
Advanced ZAP File Format Documentation is [here](zap.md).
The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written.
Current usage:
- mmap the entire file
- crc-32 bytes and version are in fixed position at end of the file
- reading remainder of footer could be version specific
- remainder of footer gives us:
- 3 important offsets (docValue , fields index and stored data index)
- 2 important values (number of docs and chunk factor)
- field data is processed once and memoized onto the heap so that we never have to go back to disk for it
- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends.
- access to all other indexed data follows the following pattern:
- first know the field name -> convert to id
- next navigate to term dictionary for that field
- some operations stop here and do dictionary ops
- next use dictionary to navigate to posting list for a specific term
- walk posting list
- if necessary, walk posting details as we go
- if location info is desired, consult location bitmap to see if it is there
## stored fields section
- for each document
- preparation phase:
- produce a slice of metadata bytes and data bytes
- produce these slices in field id order
- field value is appended to the data slice
- metadata slice is varint encoded with the following values for each field value
- field id (uint16)
- field type (byte)
- field value start offset in uncompressed data slice (uint64)
- field value length (uint64)
- field number of array positions (uint64)
- one additional value for each array position (uint64)
- compress the data slice using snappy
- file writing phase:
- remember the start offset for this document
- write out meta data length (varint uint64)
- write out compressed data length (varint uint64)
- write out the metadata bytes
- write out the compressed data bytes
## stored fields idx
- for each document
- write start offset (remembered from previous section) of stored data (big endian uint64)
With this index and a known document number, we have direct access to all the stored field data.
## posting details (freq/norm) section
- for each posting list
- produce a slice containing multiple consecutive chunks (each chunk is varint stream)
- produce a slice remembering offsets of where each chunk starts
- preparation phase:
- for each hit in the posting list
- if this hit is in next chunk close out encoding of last chunk and record offset start of next
- encode term frequency (uint64)
- encode norm factor (float32)
- file writing phase:
- remember start position for this posting list details
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
## posting details (location) section
- for each posting list
- produce a slice containing multiple consecutive chunks (each chunk is varint stream)
- produce a slice remembering offsets of where each chunk starts
- preparation phase:
- for each hit in the posting list
- if this hit is in next chunk close out encoding of last chunk and record offset start of next
- encode field (uint16)
- encode field pos (uint64)
- encode field start (uint64)
- encode field end (uint64)
- encode number of array positions to follow (uint64)
- encode each array position (each uint64)
- file writing phase:
- remember start position for this posting list details
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
## postings list section
- for each posting list
- preparation phase:
- encode roaring bitmap posting list to bytes (so we know the length)
- file writing phase:
- remember the start position for this posting list
- write freq/norm details offset (remembered from previous, as varint uint64)
- write location details offset (remembered from previous, as varint uint64)
- write length of encoded roaring bitmap
- write the serialized roaring bitmap data
## dictionary
- for each field
- preparation phase:
- encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous)
- file writing phase:
- remember the start position of this persistDictionary
- write length of vellum data (varint uint64)
- write out vellum data
## fields section
- for each field
- file writing phase:
- remember start offset for each field
- write dictionary address (remembered from previous) (varint uint64)
- write length of field name (varint uint64)
- write field name bytes
## fields idx
- for each field
- file writing phase:
- write big endian uint64 of start offset for each field
NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size.
## fields DocValue
- for each field
- preparation phase:
- produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data
- produce a slice remembering the length of each chunk
- file writing phase:
- remember the start position of this first field DocValue offset in the footer
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any
read operation leverage that meta information to extract the document specific data from the file.
## footer
- file writing phase
- write number of docs (big endian uint64)
- write stored field index location (big endian uint64)
- write field index location (big endian uint64)
- write field docValue location (big endian uint64)
- write out chunk factor (big endian uint32)
- write out version (big endian uint32)
- write out file CRC of everything preceding this (big endian uint32)

@ -0,0 +1,156 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bufio"
"math"
"os"
"github.com/couchbase/vellum"
)
const Version uint32 = 15
const Type string = "zap"
const fieldNotUninverted = math.MaxUint64
func (sb *SegmentBase) Persist(path string) error {
return PersistSegmentBase(sb, path)
}
// PersistSegmentBase persists SegmentBase in the zap file format.
func PersistSegmentBase(sb *SegmentBase, path string) error {
flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600)
if err != nil {
return err
}
cleanup := func() {
_ = f.Close()
_ = os.Remove(path)
}
br := bufio.NewWriter(f)
_, err = br.Write(sb.mem)
if err != nil {
cleanup()
return err
}
err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset,
sb.chunkMode, sb.memCRC, br)
if err != nil {
cleanup()
return err
}
err = br.Flush()
if err != nil {
cleanup()
return err
}
err = f.Sync()
if err != nil {
cleanup()
return err
}
err = f.Close()
if err != nil {
cleanup()
return err
}
return nil
}
func persistStoredFieldValues(fieldID int,
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
curr int, metaEncode varintEncoder, data []byte) (
int, []byte, error) {
for i := 0; i < len(storedFieldValues); i++ {
// encode field
_, err := metaEncode(uint64(fieldID))
if err != nil {
return 0, nil, err
}
// encode type
_, err = metaEncode(uint64(stf[i]))
if err != nil {
return 0, nil, err
}
// encode start offset
_, err = metaEncode(uint64(curr))
if err != nil {
return 0, nil, err
}
// end len
_, err = metaEncode(uint64(len(storedFieldValues[i])))
if err != nil {
return 0, nil, err
}
// encode number of array pos
_, err = metaEncode(uint64(len(spf[i])))
if err != nil {
return 0, nil, err
}
// encode all array positions
for _, pos := range spf[i] {
_, err = metaEncode(pos)
if err != nil {
return 0, nil, err
}
}
data = append(data, storedFieldValues[i]...)
curr += len(storedFieldValues[i])
}
return curr, data, nil
}
func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32,
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
dictLocs []uint64) (*SegmentBase, error) {
sb := &SegmentBase{
mem: mem,
memCRC: memCRC,
chunkMode: chunkMode,
fieldsMap: fieldsMap,
fieldsInv: fieldsInv,
numDocs: numDocs,
storedIndexOffset: storedIndexOffset,
fieldsIndexOffset: fieldsIndexOffset,
docValueOffset: docValueOffset,
dictLocs: dictLocs,
fieldDvReaders: make(map[uint16]*docValueReader),
fieldFSTs: make(map[uint16]*vellum.FST),
}
sb.updateSize()
err := sb.loadDvReaders()
if err != nil {
return nil, err
}
return sb, nil
}

@ -0,0 +1,67 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"fmt"
)
// LegacyChunkMode was the original chunk mode (always chunk size 1024)
// this mode is still used for chunking doc values.
var LegacyChunkMode uint32 = 1024
// DefaultChunkMode is the most recent improvement to chunking and should
// be used by default.
var DefaultChunkMode uint32 = 1026
func getChunkSize(chunkMode uint32, cardinality uint64, maxDocs uint64) (uint64, error) {
switch {
// any chunkMode <= 1024 will always chunk with chunkSize=chunkMode
case chunkMode <= 1024:
// legacy chunk size
return uint64(chunkMode), nil
case chunkMode == 1025:
// attempt at simple improvement
// theory - the point of chunking is to put a bound on the maximum number of
// calls to Next() needed to find a random document. ie, you should be able
// to do one jump to the correct chunk, and then walk through at most
// chunk-size items
// previously 1024 was chosen as the chunk size, but this is particularly
// wasteful for low cardinality terms. the observation is that if there
// are less than 1024 items, why not put them all in one chunk,
// this way you'll still achieve the same goal of visiting at most
// chunk-size items.
// no attempt is made to tweak any other case
if cardinality <= 1024 {
return maxDocs, nil
}
return 1024, nil
case chunkMode == 1026:
// improve upon the ideas tested in chunkMode 1025
// the observation that the fewest number of dense chunks is the most
// desirable layout, given the built-in assumptions of chunking
// (that we want to put an upper-bound on the number of items you must
// walk over without skipping, currently tuned to 1024)
//
// 1. compute the number of chunks needed (max 1024/chunk)
// 2. convert to chunkSize, dividing into maxDocs
numChunks := (cardinality / 1024) + 1
chunkSize := maxDocs / numChunks
return chunkSize, nil
}
return 0, fmt.Errorf("unknown chunk mode %d", chunkMode)
}

@ -0,0 +1,243 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"io"
"reflect"
"github.com/golang/snappy"
)
var reflectStaticSizeMetaData int
func init() {
var md MetaData
reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size())
}
var termSeparator byte = 0xff
var termSeparatorSplitSlice = []byte{termSeparator}
type chunkedContentCoder struct {
final []byte
chunkSize uint64
currChunk uint64
chunkLens []uint64
w io.Writer
progressiveWrite bool
chunkMetaBuf bytes.Buffer
chunkBuf bytes.Buffer
chunkMeta []MetaData
compressed []byte // temp buf for snappy compression
}
// MetaData represents the data information inside a
// chunk.
type MetaData struct {
DocNum uint64 // docNum of the data inside the chunk
DocDvOffset uint64 // offset of data inside the chunk for the given docid
}
// newChunkedContentCoder returns a new chunk content coder which
// packs data into chunks based on the provided chunkSize
func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64,
w io.Writer, progressiveWrite bool) *chunkedContentCoder {
total := maxDocNum/chunkSize + 1
rv := &chunkedContentCoder{
chunkSize: chunkSize,
chunkLens: make([]uint64, total),
chunkMeta: make([]MetaData, 0, total),
w: w,
progressiveWrite: progressiveWrite,
}
return rv
}
// Reset lets you reuse this chunked content coder. Buffers are reset
// and re used. You cannot change the chunk size.
func (c *chunkedContentCoder) Reset() {
c.currChunk = 0
c.final = c.final[:0]
c.chunkBuf.Reset()
c.chunkMetaBuf.Reset()
for i := range c.chunkLens {
c.chunkLens[i] = 0
}
c.chunkMeta = c.chunkMeta[:0]
}
func (c *chunkedContentCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) {
total := int(maxDocNum/chunkSize + 1)
c.chunkSize = chunkSize
if cap(c.chunkLens) < total {
c.chunkLens = make([]uint64, total)
} else {
c.chunkLens = c.chunkLens[:total]
}
if cap(c.chunkMeta) < total {
c.chunkMeta = make([]MetaData, 0, total)
}
}
// Close indicates you are done calling Add() this allows
// the final chunk to be encoded.
func (c *chunkedContentCoder) Close() error {
return c.flushContents()
}
func (c *chunkedContentCoder) flushContents() error {
// flush the contents, with meta information at first
buf := make([]byte, binary.MaxVarintLen64)
n := binary.PutUvarint(buf, uint64(len(c.chunkMeta)))
_, err := c.chunkMetaBuf.Write(buf[:n])
if err != nil {
return err
}
// write out the metaData slice
for _, meta := range c.chunkMeta {
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
if err != nil {
return err
}
}
// write the metadata to final data
metaData := c.chunkMetaBuf.Bytes()
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
// write the compressed data to the final data
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
c.final = append(c.final, c.compressed...)
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))
if c.progressiveWrite {
_, err := c.w.Write(c.final)
if err != nil {
return err
}
c.final = c.final[:0]
}
return nil
}
// Add encodes the provided byte slice into the correct chunk for the provided
// doc num. You MUST call Add() with increasing docNums.
func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// flush out the previous chunk details
err := c.flushContents()
if err != nil {
return err
}
// clearing the chunk specific meta for next chunk
c.chunkBuf.Reset()
c.chunkMetaBuf.Reset()
c.chunkMeta = c.chunkMeta[:0]
c.currChunk = chunk
}
// get the starting offset for this doc
dvOffset := c.chunkBuf.Len()
dvSize, err := c.chunkBuf.Write(vals)
if err != nil {
return err
}
c.chunkMeta = append(c.chunkMeta, MetaData{
DocNum: docNum,
DocDvOffset: uint64(dvOffset + dvSize),
})
return nil
}
// Write commits all the encoded chunked contents to the provided writer.
//
// | ..... data ..... | chunk offsets (varints)
// | position of chunk offsets (uint64) | number of offsets (uint64) |
//
func (c *chunkedContentCoder) Write() (int, error) {
var tw int
if c.final != nil {
// write out the data section first
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
}
chunkOffsetsStart := uint64(tw)
if cap(c.final) < binary.MaxVarintLen64 {
c.final = make([]byte, binary.MaxVarintLen64)
} else {
c.final = c.final[0:binary.MaxVarintLen64]
}
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
// write out the chunk offsets
for _, chunkOffset := range chunkOffsets {
n := binary.PutUvarint(c.final, chunkOffset)
nw, err := c.w.Write(c.final[:n])
tw += nw
if err != nil {
return tw, err
}
}
chunkOffsetsLen := uint64(tw) - chunkOffsetsStart
c.final = c.final[0:8]
// write out the length of chunk offsets
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
// write out the number of chunks
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
nw, err = c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
c.final = c.final[:0]
return tw, nil
}
// ReadDocValueBoundary elicits the start, end offsets from a
// metaData header slice
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
var start uint64
if chunk > 0 {
start = metaHeaders[chunk-1].DocDvOffset
}
return start, metaHeaders[chunk].DocDvOffset
}

@ -0,0 +1,61 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"hash/crc32"
"io"
"github.com/blevesearch/bleve/index/scorch/segment"
)
// CountHashWriter is a wrapper around a Writer which counts the number of
// bytes which have been written and computes a crc32 hash
type CountHashWriter struct {
w io.Writer
crc uint32
n int
s segment.StatsReporter
}
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
func NewCountHashWriter(w io.Writer) *CountHashWriter {
return &CountHashWriter{w: w}
}
func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter {
return &CountHashWriter{w: w, s: s}
}
// Write writes the provided bytes to the wrapped writer and counts the bytes
func (c *CountHashWriter) Write(b []byte) (int, error) {
n, err := c.w.Write(b)
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
c.n += n
if c.s != nil {
c.s.ReportBytesWritten(uint64(n))
}
return n, err
}
// Count returns the number of bytes written
func (c *CountHashWriter) Count() int {
return c.n
}
// Sum32 returns the CRC-32 hash of the content written to this writer
func (c *CountHashWriter) Sum32() uint32 {
return c.crc
}

@ -0,0 +1,263 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
)
// Dictionary is the zap representation of the term dictionary
type Dictionary struct {
sb *SegmentBase
field string
fieldID uint16
fst *vellum.FST
fstReader *vellum.Reader
}
// PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
prealloc segment.PostingsList) (segment.PostingsList, error) {
var preallocPL *PostingsList
pl, ok := prealloc.(*PostingsList)
if ok && pl != nil {
preallocPL = pl
}
return d.postingsList(term, except, preallocPL)
}
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
if d.fstReader == nil {
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil
}
postingsOffset, exists, err := d.fstReader.Get(term)
if err != nil {
return nil, fmt.Errorf("vellum err: %v", err)
}
if !exists {
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil
}
return d.postingsListFromOffset(postingsOffset, except, rv)
}
func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
rv = d.postingsListInit(rv, except)
err := rv.read(postingsOffset, d)
if err != nil {
return nil, err
}
return rv, nil
}
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
if rv == nil || rv == emptyPostingsList {
rv = &PostingsList{}
} else {
postings := rv.postings
if postings != nil {
postings.Clear()
}
*rv = PostingsList{} // clear the struct
rv.postings = postings
}
rv.sb = d.sb
rv.except = except
return rv
}
func (d *Dictionary) Contains(key []byte) (bool, error) {
return d.fst.Contains(key)
}
// Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
itr, err := d.fst.Iterator(nil, nil)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
// PrefixIterator returns an iterator which only visits terms having the
// the specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
kBeg := []byte(prefix)
kEnd := segment.IncrementBytes(kBeg)
if d.fst != nil {
itr, err := d.fst.Iterator(kBeg, kEnd)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
// need to increment the end position to be inclusive
var endBytes []byte
if len(end) > 0 {
endBytes = []byte(end)
if endBytes[len(endBytes)-1] < 0xff {
endBytes[len(endBytes)-1]++
} else {
endBytes = append(endBytes, 0xff)
}
}
if d.fst != nil {
itr, err := d.fst.Iterator([]byte(start), endBytes)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
// AutomatonIterator returns an iterator which only visits terms
// having the the vellum automaton and start/end key range
func (d *Dictionary) AutomatonIterator(a vellum.Automaton,
startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte,
includeCount bool) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
omitCount: !includeCount,
}
var buf bytes.Buffer
builder, err := vellum.New(&buf, nil)
if err != nil {
rv.err = err
return rv
}
for _, term := range onlyTerms {
err = builder.Insert(term, 0)
if err != nil {
rv.err = err
return rv
}
}
err = builder.Close()
if err != nil {
rv.err = err
return rv
}
onlyFST, err := vellum.Load(buf.Bytes())
if err != nil {
rv.err = err
return rv
}
itr, err := d.fst.Search(onlyFST, nil, nil)
if err == nil {
rv.itr = itr
} else if err != vellum.ErrIteratorDone {
rv.err = err
}
return rv
}
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
d *Dictionary
itr vellum.Iterator
err error
tmp PostingsList
entry index.DictEntry
omitCount bool
}
// Next returns the next entry in the dictionary
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
if i.err != nil && i.err != vellum.ErrIteratorDone {
return nil, i.err
} else if i.itr == nil || i.err == vellum.ErrIteratorDone {
return nil, nil
}
term, postingsOffset := i.itr.Current()
i.entry.Term = string(term)
if !i.omitCount {
i.err = i.tmp.read(postingsOffset, i.d)
if i.err != nil {
return nil, i.err
}
i.entry.Count = i.tmp.Count()
}
i.err = i.itr.Next()
return &i.entry, nil
}

@ -0,0 +1,312 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"math"
"reflect"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
"github.com/golang/snappy"
)
var reflectStaticSizedocValueReader int
func init() {
var dvi docValueReader
reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
}
type docNumTermsVisitor func(docNum uint64, terms []byte) error
type docVisitState struct {
dvrs map[uint16]*docValueReader
segment *SegmentBase
}
type docValueReader struct {
field string
curChunkNum uint64
chunkOffsets []uint64
dvDataLoc uint64
curChunkHeader []MetaData
curChunkData []byte // compressed data cache
uncompressed []byte // temp buf for snappy decompression
}
func (di *docValueReader) size() int {
return reflectStaticSizedocValueReader + size.SizeOfPtr +
len(di.field) +
len(di.chunkOffsets)*size.SizeOfUint64 +
len(di.curChunkHeader)*reflectStaticSizeMetaData +
len(di.curChunkData)
}
func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
if rv == nil {
rv = &docValueReader{}
}
rv.field = di.field
rv.curChunkNum = math.MaxUint64
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
rv.dvDataLoc = di.dvDataLoc
rv.curChunkHeader = rv.curChunkHeader[:0]
rv.curChunkData = nil
rv.uncompressed = rv.uncompressed[:0]
return rv
}
func (di *docValueReader) curChunkNumber() uint64 {
return di.curChunkNum
}
func (s *SegmentBase) loadFieldDocValueReader(field string,
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
// get the docValue offset for the given fields
if fieldDvLocStart == fieldNotUninverted {
// no docValues found, nothing to do
return nil, nil
}
// read the number of chunks, and chunk offsets position
var numChunks, chunkOffsetsPosition uint64
if fieldDvLocEnd-fieldDvLocStart > 16 {
numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
// read the length of chunk offsets
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
// acquire position of chunk offsets
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
} else {
return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart)
}
fdvIter := &docValueReader{
curChunkNum: math.MaxUint64,
field: field,
chunkOffsets: make([]uint64, int(numChunks)),
}
// read the chunk offsets
var offset uint64
for i := 0; i < int(numChunks); i++ {
loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
if read <= 0 {
return nil, fmt.Errorf("corrupted chunk offset during segment load")
}
fdvIter.chunkOffsets[i] = loc
offset += uint64(read)
}
// set the data offset
fdvIter.dvDataLoc = fieldDvLocStart
return fdvIter, nil
}
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
// advance to the chunk where the docValues
// reside for the given docNum
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
if start >= end {
di.curChunkHeader = di.curChunkHeader[:0]
di.curChunkData = nil
di.curChunkNum = chunkNumber
di.uncompressed = di.uncompressed[:0]
return nil
}
destChunkDataLoc += start
curChunkEnd += end
// read the number of docs reside in the chunk
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
if read <= 0 {
return fmt.Errorf("failed to read the chunk")
}
chunkMetaLoc := destChunkDataLoc + uint64(read)
offset := uint64(0)
if cap(di.curChunkHeader) < int(numDocs) {
di.curChunkHeader = make([]MetaData, int(numDocs))
} else {
di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
}
for i := 0; i < int(numDocs); i++ {
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
}
compressedDataLoc := chunkMetaLoc + offset
dataLength := curChunkEnd - compressedDataLoc
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
di.curChunkNum = chunkNumber
di.uncompressed = di.uncompressed[:0]
return nil
}
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
for i := 0; i < len(di.chunkOffsets); i++ {
err := di.loadDvChunk(uint64(i), s)
if err != nil {
return err
}
if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
continue
}
// uncompress the already loaded data
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
if err != nil {
return err
}
di.uncompressed = uncompressed
start := uint64(0)
for _, entry := range di.curChunkHeader {
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
if err != nil {
return err
}
start = entry.DocDvOffset
}
}
return nil
}
func (di *docValueReader) visitDocValues(docNum uint64,
visitor index.DocumentFieldTermVisitor) error {
// binary search the term locations for the docNum
start, end := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
return nil
}
var uncompressed []byte
var err error
// use the uncompressed copy if available
if len(di.uncompressed) > 0 {
uncompressed = di.uncompressed
} else {
// uncompress the already loaded data
uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
if err != nil {
return err
}
di.uncompressed = uncompressed
}
// pick the terms for the given docNum
uncompressed = uncompressed[start:end]
for {
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
if i < 0 {
break
}
visitor(di.field, uncompressed[0:i])
uncompressed = uncompressed[i+1:]
}
return nil
}
func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
i := sort.Search(len(di.curChunkHeader), func(i int) bool {
return di.curChunkHeader[i].DocNum >= docNum
})
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
return ReadDocValueBoundary(i, di.curChunkHeader)
}
return math.MaxUint64, math.MaxUint64
}
// VisitDocumentFieldTerms is an implementation of the
// DocumentFieldTermVisitable interface
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
segment.DocVisitState, error) {
dvs, ok := dvsIn.(*docVisitState)
if !ok || dvs == nil {
dvs = &docVisitState{}
} else {
if dvs.segment != s {
dvs.segment = s
dvs.dvrs = nil
}
}
var fieldIDPlus1 uint16
if dvs.dvrs == nil {
dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
for _, field := range fields {
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue
}
fieldID := fieldIDPlus1 - 1
if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
dvIter != nil {
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
}
}
}
// find the chunkNumber where the docValues are stored
// NOTE: doc values continue to use legacy chunk mode
chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0)
if err != nil {
return nil, err
}
docInChunk := localDocNum / chunkFactor
var dvr *docValueReader
for _, field := range fields {
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue
}
fieldID := fieldIDPlus1 - 1
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
// check if the chunk is already loaded
if docInChunk != dvr.curChunkNumber() {
err := dvr.loadDvChunk(docInChunk, s)
if err != nil {
return dvs, err
}
}
_ = dvr.visitDocValues(localDocNum, visitor)
}
}
return dvs, nil
}
// VisitableDocValueFields returns the list of fields with
// persisted doc value terms ready to be visitable using the
// VisitDocumentFieldTerms method.
func (s *SegmentBase) VisitableDocValueFields() ([]string, error) {
return s.fieldDvNames, nil
}

@ -0,0 +1,138 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"github.com/couchbase/vellum"
)
// enumerator provides an ordered traversal of multiple vellum
// iterators. Like JOIN of iterators, the enumerator produces a
// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC,
// then iteratorIndex ASC, where the same key might be seen or
// repeated across multiple child iterators.
type enumerator struct {
itrs []vellum.Iterator
currKs [][]byte
currVs []uint64
lowK []byte
lowIdxs []int
lowCurr int
}
// newEnumerator returns a new enumerator over the vellum Iterators
func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) {
rv := &enumerator{
itrs: itrs,
currKs: make([][]byte, len(itrs)),
currVs: make([]uint64, len(itrs)),
lowIdxs: make([]int, 0, len(itrs)),
}
for i, itr := range rv.itrs {
rv.currKs[i], rv.currVs[i] = itr.Current()
}
rv.updateMatches(false)
if rv.lowK == nil && len(rv.lowIdxs) == 0 {
return rv, vellum.ErrIteratorDone
}
return rv, nil
}
// updateMatches maintains the low key matches based on the currKs
func (m *enumerator) updateMatches(skipEmptyKey bool) {
m.lowK = nil
m.lowIdxs = m.lowIdxs[:0]
m.lowCurr = 0
for i, key := range m.currKs {
if (key == nil && m.currVs[i] == 0) || // in case of empty iterator
(len(key) == 0 && skipEmptyKey) { // skip empty keys
continue
}
cmp := bytes.Compare(key, m.lowK)
if cmp < 0 || len(m.lowIdxs) == 0 {
// reached a new low
m.lowK = key
m.lowIdxs = m.lowIdxs[:0]
m.lowIdxs = append(m.lowIdxs, i)
} else if cmp == 0 {
m.lowIdxs = append(m.lowIdxs, i)
}
}
}
// Current returns the enumerator's current key, iterator-index, and
// value. If the enumerator is not pointing at a valid value (because
// Next returned an error previously), Current will return nil,0,0.
func (m *enumerator) Current() ([]byte, int, uint64) {
var i int
var v uint64
if m.lowCurr < len(m.lowIdxs) {
i = m.lowIdxs[m.lowCurr]
v = m.currVs[i]
}
return m.lowK, i, v
}
// GetLowIdxsAndValues will return all of the iterator indices
// which point to the current key, and their corresponding
// values. This can be used by advanced caller which may need
// to peek into these other sets of data before processing.
func (m *enumerator) GetLowIdxsAndValues() ([]int, []uint64) {
values := make([]uint64, 0, len(m.lowIdxs))
for _, idx := range m.lowIdxs {
values = append(values, m.currVs[idx])
}
return m.lowIdxs, values
}
// Next advances the enumerator to the next key/iterator/value result,
// else vellum.ErrIteratorDone is returned.
func (m *enumerator) Next() error {
m.lowCurr += 1
if m.lowCurr >= len(m.lowIdxs) {
// move all the current low iterators forwards
for _, vi := range m.lowIdxs {
err := m.itrs[vi].Next()
if err != nil && err != vellum.ErrIteratorDone {
return err
}
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
}
// can skip any empty keys encountered at this point
m.updateMatches(true)
}
if m.lowK == nil && len(m.lowIdxs) == 0 {
return vellum.ErrIteratorDone
}
return nil
}
// Close all the underlying Iterators. The first error, if any, will
// be returned.
func (m *enumerator) Close() error {
var rv error
for _, itr := range m.itrs {
err := itr.Close()
if rv == nil {
rv = err
}
}
return rv
}

@ -0,0 +1,12 @@
module github.com/blevesearch/zap/v15
go 1.12
require (
github.com/RoaringBitmap/roaring v0.4.23
github.com/blevesearch/bleve v1.0.12
github.com/blevesearch/mmap-go v1.0.2
github.com/couchbase/vellum v1.0.2
github.com/golang/snappy v0.0.1
github.com/spf13/cobra v0.0.5
)

@ -0,0 +1,118 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"encoding/binary"
"fmt"
"github.com/blevesearch/bleve/index/scorch/segment"
)
type chunkedIntDecoder struct {
startOffset uint64
dataStartOffset uint64
chunkOffsets []uint64
curChunkBytes []byte
data []byte
r *segment.MemUvarintReader
}
// newChunkedIntDecoder expects an optional or reset chunkedIntDecoder for better reuse.
func newChunkedIntDecoder(buf []byte, offset uint64, rv *chunkedIntDecoder) *chunkedIntDecoder {
if rv == nil {
rv = &chunkedIntDecoder{startOffset: offset, data: buf}
} else {
rv.startOffset = offset
rv.data = buf
}
var n, numChunks uint64
var read int
if offset == termNotEncoded {
numChunks = 0
} else {
numChunks, read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64])
}
n += uint64(read)
if cap(rv.chunkOffsets) >= int(numChunks) {
rv.chunkOffsets = rv.chunkOffsets[:int(numChunks)]
} else {
rv.chunkOffsets = make([]uint64, int(numChunks))
}
for i := 0; i < int(numChunks); i++ {
rv.chunkOffsets[i], read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.dataStartOffset = offset + n
return rv
}
func (d *chunkedIntDecoder) loadChunk(chunk int) error {
if d.startOffset == termNotEncoded {
d.r = segment.NewMemUvarintReader([]byte(nil))
return nil
}
if chunk >= len(d.chunkOffsets) {
return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)",
chunk, len(d.chunkOffsets))
}
end, start := d.dataStartOffset, d.dataStartOffset
s, e := readChunkBoundary(chunk, d.chunkOffsets)
start += s
end += e
d.curChunkBytes = d.data[start:end]
if d.r == nil {
d.r = segment.NewMemUvarintReader(d.curChunkBytes)
} else {
d.r.Reset(d.curChunkBytes)
}
return nil
}
func (d *chunkedIntDecoder) reset() {
d.startOffset = 0
d.dataStartOffset = 0
d.chunkOffsets = d.chunkOffsets[:0]
d.curChunkBytes = d.curChunkBytes[:0]
d.data = d.data[:0]
if d.r != nil {
d.r.Reset([]byte(nil))
}
}
func (d *chunkedIntDecoder) isNil() bool {
return d.curChunkBytes == nil || len(d.curChunkBytes) == 0
}
func (d *chunkedIntDecoder) readUvarint() (uint64, error) {
return d.r.ReadUvarint()
}
func (d *chunkedIntDecoder) SkipUvarint() {
d.r.SkipUvarint()
}
func (d *chunkedIntDecoder) SkipBytes(count int) {
d.r.SkipBytes(count)
}
func (d *chunkedIntDecoder) Len() int {
return d.r.Len()
}

@ -0,0 +1,206 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"io"
)
// We can safely use 0 to represent termNotEncoded since 0
// could never be a valid address for term location information.
// (stored field index is always non-empty and earlier in the
// file)
const termNotEncoded = 0
type chunkedIntCoder struct {
final []byte
chunkSize uint64
chunkBuf bytes.Buffer
chunkLens []uint64
currChunk uint64
buf []byte
}
// newChunkedIntCoder returns a new chunk int coder which packs data into
// chunks based on the provided chunkSize and supports up to the specified
// maxDocNum
func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
total := maxDocNum/chunkSize + 1
rv := &chunkedIntCoder{
chunkSize: chunkSize,
chunkLens: make([]uint64, total),
final: make([]byte, 0, 64),
}
return rv
}
// Reset lets you reuse this chunked int coder. buffers are reset and reused
// from previous use. you cannot change the chunk size or max doc num.
func (c *chunkedIntCoder) Reset() {
c.final = c.final[:0]
c.chunkBuf.Reset()
c.currChunk = 0
for i := range c.chunkLens {
c.chunkLens[i] = 0
}
}
// SetChunkSize changes the chunk size. It is only valid to do so
// with a new chunkedIntCoder, or immediately after calling Reset()
func (c *chunkedIntCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) {
total := int(maxDocNum/chunkSize + 1)
c.chunkSize = chunkSize
if cap(c.chunkLens) < total {
c.chunkLens = make([]uint64, total)
} else {
c.chunkLens = c.chunkLens[:total]
}
}
// Add encodes the provided integers into the correct chunk for the provided
// doc num. You MUST call Add() with increasing docNums.
func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// starting a new chunk
c.Close()
c.chunkBuf.Reset()
c.currChunk = chunk
}
if len(c.buf) < binary.MaxVarintLen64 {
c.buf = make([]byte, binary.MaxVarintLen64)
}
for _, val := range vals {
wb := binary.PutUvarint(c.buf, val)
_, err := c.chunkBuf.Write(c.buf[:wb])
if err != nil {
return err
}
}
return nil
}
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// starting a new chunk
c.Close()
c.chunkBuf.Reset()
c.currChunk = chunk
}
_, err := c.chunkBuf.Write(buf)
return err
}
// Close indicates you are done calling Add() this allows the final chunk
// to be encoded.
func (c *chunkedIntCoder) Close() {
encodingBytes := c.chunkBuf.Bytes()
c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
c.final = append(c.final, encodingBytes...)
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close
}
// Write commits all the encoded chunked integers to the provided writer.
func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens))
if len(c.buf) < bufNeeded {
c.buf = make([]byte, bufNeeded)
}
buf := c.buf
// convert the chunk lengths into chunk offsets
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
// write out the number of chunks & each chunk offsets
n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
for _, chunkOffset := range chunkOffsets {
n += binary.PutUvarint(buf[n:], chunkOffset)
}
tw, err := w.Write(buf[:n])
if err != nil {
return tw, err
}
// write out the data
nw, err := w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
return tw, nil
}
// writeAt commits all the encoded chunked integers to the provided writer
// and returns the starting offset, total bytes written and an error
func (c *chunkedIntCoder) writeAt(w io.Writer) (uint64, int, error) {
startOffset := uint64(termNotEncoded)
if len(c.final) <= 0 {
return startOffset, 0, nil
}
if chw := w.(*CountHashWriter); chw != nil {
startOffset = uint64(chw.Count())
}
tw, err := c.Write(w)
return startOffset, tw, err
}
func (c *chunkedIntCoder) FinalSize() int {
return len(c.final)
}
// modifyLengthsToEndOffsets converts the chunk length array
// to a chunk offset array. The readChunkBoundary
// will figure out the start and end of every chunk from
// these offsets. Starting offset of i'th index is stored
// in i-1'th position except for 0'th index and ending offset
// is stored at i'th index position.
// For 0'th element, starting position is always zero.
// eg:
// Lens -> 5 5 5 5 => 5 10 15 20
// Lens -> 0 5 0 5 => 0 5 5 10
// Lens -> 0 0 0 5 => 0 0 0 5
// Lens -> 5 0 0 0 => 5 5 5 5
// Lens -> 0 5 0 0 => 0 5 5 5
// Lens -> 0 0 5 0 => 0 0 5 5
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
var runningOffset uint64
var index, i int
for i = 1; i <= len(lengths); i++ {
runningOffset += lengths[i-1]
lengths[index] = runningOffset
index++
}
return lengths
}
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
var start uint64
if chunk > 0 {
start = offsets[chunk-1]
}
return start, offsets[chunk]
}

@ -0,0 +1,852 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bufio"
"bytes"
"encoding/binary"
"fmt"
"math"
"os"
"sort"
"github.com/RoaringBitmap/roaring"
seg "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
"github.com/golang/snappy"
)
var DefaultFileMergerBufferSize = 1024 * 1024
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
// Merge takes a slice of segments and bit masks describing which
// documents may be dropped, and creates a new segment containing the
// remaining data. This new segment is built at the specified path.
func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string,
closeCh chan struct{}, s seg.StatsReporter) (
[][]uint64, uint64, error) {
segmentBases := make([]*SegmentBase, len(segments))
for segmenti, segment := range segments {
switch segmentx := segment.(type) {
case *Segment:
segmentBases[segmenti] = &segmentx.SegmentBase
case *SegmentBase:
segmentBases[segmenti] = segmentx
default:
panic(fmt.Sprintf("oops, unexpected segment type: %T", segment))
}
}
return mergeSegmentBases(segmentBases, drops, path, DefaultChunkMode, closeCh, s)
}
func mergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string,
chunkMode uint32, closeCh chan struct{}, s seg.StatsReporter) (
[][]uint64, uint64, error) {
flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600)
if err != nil {
return nil, 0, err
}
cleanup := func() {
_ = f.Close()
_ = os.Remove(path)
}
// buffer the output
br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize)
// wrap it for counting (tracking offsets)
cr := NewCountHashWriterWithStatsReporter(br, s)
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
MergeToWriter(segmentBases, drops, chunkMode, cr, closeCh)
if err != nil {
cleanup()
return nil, 0, err
}
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
docValueOffset, chunkMode, cr.Sum32(), cr)
if err != nil {
cleanup()
return nil, 0, err
}
err = br.Flush()
if err != nil {
cleanup()
return nil, 0, err
}
err = f.Sync()
if err != nil {
cleanup()
return nil, 0, err
}
err = f.Close()
if err != nil {
cleanup()
return nil, 0, err
}
return newDocNums, uint64(cr.Count()), nil
}
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
chunkMode uint32, cr *CountHashWriter, closeCh chan struct{}) (
newDocNums [][]uint64,
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
err error) {
docValueOffset = uint64(fieldNotUninverted)
var fieldsSame bool
fieldsSame, fieldsInv = mergeFields(segments)
fieldsMap = mapFields(fieldsInv)
numDocs = computeNewDocCount(segments, drops)
if isClosed(closeCh) {
return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed
}
if numDocs > 0 {
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
fieldsInv, fieldsMap, fieldsSame,
newDocNums, numDocs, chunkMode, cr, closeCh)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
} else {
dictLocs = make([]uint64, len(fieldsInv))
}
fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, nil
}
// mapFields takes the fieldsInv list and returns a map of fieldName
// to fieldID+1
func mapFields(fields []string) map[string]uint16 {
rv := make(map[string]uint16, len(fields))
for i, fieldName := range fields {
rv[fieldName] = uint16(i) + 1
}
return rv
}
// computeNewDocCount determines how many documents will be in the newly
// merged segment when obsoleted docs are dropped
func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 {
var newDocCount uint64
for segI, segment := range segments {
newDocCount += segment.numDocs
if drops[segI] != nil {
newDocCount -= drops[segI].GetCardinality()
}
}
return newDocCount
}
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkMode uint32,
w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) {
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var bufLoc []uint64
var postings *PostingsList
var postItr *PostingsIterator
rv := make([]uint64, len(fieldsInv))
fieldDvLocsStart := make([]uint64, len(fieldsInv))
fieldDvLocsEnd := make([]uint64, len(fieldsInv))
// these int coders are initialized with chunk size 1024
// however this will be reset to the correct chunk size
// while processing each individual field-term section
tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
locEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
var vellumBuf bytes.Buffer
newVellum, err := vellum.New(&vellumBuf, nil)
if err != nil {
return nil, 0, err
}
newRoaring := roaring.NewBitmap()
// for each field
for fieldID, fieldName := range fieldsInv {
// collect FST iterators from all active segments for this field
var newDocNums [][]uint64
var drops []*roaring.Bitmap
var dicts []*Dictionary
var itrs []vellum.Iterator
var segmentsInFocus []*SegmentBase
for segmentI, segment := range segments {
// check for the closure in meantime
if isClosed(closeCh) {
return nil, 0, seg.ErrClosed
}
dict, err2 := segment.dictionary(fieldName)
if err2 != nil {
return nil, 0, err2
}
if dict != nil && dict.fst != nil {
itr, err2 := dict.fst.Iterator(nil, nil)
if err2 != nil && err2 != vellum.ErrIteratorDone {
return nil, 0, err2
}
if itr != nil {
newDocNums = append(newDocNums, newDocNumsIn[segmentI])
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() {
drops = append(drops, dropsIn[segmentI])
} else {
drops = append(drops, nil)
}
dicts = append(dicts, dict)
itrs = append(itrs, itr)
segmentsInFocus = append(segmentsInFocus, segment)
}
}
}
var prevTerm []byte
newRoaring.Clear()
var lastDocNum, lastFreq, lastNorm uint64
// determines whether to use "1-hit" encoding optimization
// when a term appears in only 1 doc, with no loc info,
// has freq of 1, and the docNum fits into 31-bits
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
docNum := uint64(newRoaring.Minimum())
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
return true, docNum, lastNorm
}
}
return false, 0, 0
}
finishTerm := func(term []byte) error {
tfEncoder.Close()
locEncoder.Close()
postingsOffset, err := writePostings(newRoaring,
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
if err != nil {
return err
}
if postingsOffset > 0 {
err = newVellum.Insert(term, postingsOffset)
if err != nil {
return err
}
}
newRoaring.Clear()
tfEncoder.Reset()
locEncoder.Reset()
lastDocNum = 0
lastFreq = 0
lastNorm = 0
return nil
}
enumerator, err := newEnumerator(itrs)
for err == nil {
term, itrI, postingsOffset := enumerator.Current()
if !bytes.Equal(prevTerm, term) {
// check for the closure in meantime
if isClosed(closeCh) {
return nil, 0, seg.ErrClosed
}
// if the term changed, write out the info collected
// for the previous term
err = finishTerm(prevTerm)
if err != nil {
return nil, 0, err
}
}
if !bytes.Equal(prevTerm, term) || prevTerm == nil {
// compute cardinality of field-term in new seg
var newCard uint64
lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues()
for i, idx := range lowItrIdxs {
pl, err := dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil)
if err != nil {
return nil, 0, err
}
newCard += pl.Count()
}
// compute correct chunk size with this
chunkSize, err := getChunkSize(chunkMode, newCard, newSegDocCount)
if err != nil {
return nil, 0, err
}
// update encoders chunk
tfEncoder.SetChunkSize(chunkSize, newSegDocCount-1)
locEncoder.SetChunkSize(chunkSize, newSegDocCount-1)
}
postings, err = dicts[itrI].postingsListFromOffset(
postingsOffset, drops[itrI], postings)
if err != nil {
return nil, 0, err
}
postItr = postings.iterator(true, true, true, postItr)
// can no longer optimize by copying, since chunk factor could have changed
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
fieldsMap, term, postItr, newDocNums[itrI], newRoaring,
tfEncoder, locEncoder, bufLoc)
if err != nil {
return nil, 0, err
}
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
prevTerm = append(prevTerm, term...)
err = enumerator.Next()
}
if err != vellum.ErrIteratorDone {
return nil, 0, err
}
err = finishTerm(prevTerm)
if err != nil {
return nil, 0, err
}
dictOffset := uint64(w.Count())
err = newVellum.Close()
if err != nil {
return nil, 0, err
}
vellumData := vellumBuf.Bytes()
// write out the length of the vellum data
n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData)))
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return nil, 0, err
}
// write this vellum to disk
_, err = w.Write(vellumData)
if err != nil {
return nil, 0, err
}
rv[fieldID] = dictOffset
// get the field doc value offset (start)
fieldDvLocsStart[fieldID] = uint64(w.Count())
// update the field doc values
// NOTE: doc values continue to use legacy chunk mode
chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0)
if err != nil {
return nil, 0, err
}
fdvEncoder := newChunkedContentCoder(chunkSize, newSegDocCount-1, w, true)
fdvReadersAvailable := false
var dvIterClone *docValueReader
for segmentI, segment := range segmentsInFocus {
// check for the closure in meantime
if isClosed(closeCh) {
return nil, 0, seg.ErrClosed
}
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists &&
dvIter != nil {
fdvReadersAvailable = true
dvIterClone = dvIter.cloneInto(dvIterClone)
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error {
if newDocNums[segmentI][docNum] == docDropped {
return nil
}
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms)
if err != nil {
return err
}
return nil
})
if err != nil {
return nil, 0, err
}
}
}
if fdvReadersAvailable {
err = fdvEncoder.Close()
if err != nil {
return nil, 0, err
}
// persist the doc value details for this field
_, err = fdvEncoder.Write()
if err != nil {
return nil, 0, err
}
// get the field doc value offset (end)
fieldDvLocsEnd[fieldID] = uint64(w.Count())
} else {
fieldDvLocsStart[fieldID] = fieldNotUninverted
fieldDvLocsEnd[fieldID] = fieldNotUninverted
}
// reset vellum buffer and vellum builder
vellumBuf.Reset()
err = newVellum.Reset(&vellumBuf)
if err != nil {
return nil, 0, err
}
}
fieldDvLocsOffset := uint64(w.Count())
buf := bufMaxVarintLen64
for i := 0; i < len(fieldDvLocsStart); i++ {
n := binary.PutUvarint(buf, fieldDvLocsStart[i])
_, err := w.Write(buf[:n])
if err != nil {
return nil, 0, err
}
n = binary.PutUvarint(buf, fieldDvLocsEnd[i])
_, err = w.Write(buf[:n])
if err != nil {
return nil, 0, err
}
}
return rv, fieldDvLocsOffset, nil
}
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
newDocNums []uint64, newRoaring *roaring.Bitmap,
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) (
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
next, err := postItr.Next()
for next != nil && err == nil {
hitNewDocNum := newDocNums[next.Number()]
if hitNewDocNum == docDropped {
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
}
newRoaring.Add(uint32(hitNewDocNum))
nextFreq := next.Frequency()
var nextNorm uint64
if pi, ok := next.(*Posting); ok {
nextNorm = pi.NormUint64()
} else {
return 0, 0, 0, nil, fmt.Errorf("unexpected posting type %T", next)
}
locs := next.Locations()
err = tfEncoder.Add(hitNewDocNum,
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
if err != nil {
return 0, 0, 0, nil, err
}
if len(locs) > 0 {
numBytesLocs := 0
for _, loc := range locs {
ap := loc.ArrayPositions()
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap)
}
err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs))
if err != nil {
return 0, 0, 0, nil, err
}
for _, loc := range locs {
ap := loc.ArrayPositions()
if cap(bufLoc) < 5+len(ap) {
bufLoc = make([]uint64, 0, 5+len(ap))
}
args := bufLoc[0:5]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = loc.Pos()
args[2] = loc.Start()
args[3] = loc.End()
args[4] = uint64(len(ap))
args = append(args, ap...)
err = locEncoder.Add(hitNewDocNum, args...)
if err != nil {
return 0, 0, 0, nil, err
}
}
}
lastDocNum = hitNewDocNum
lastFreq = nextFreq
lastNorm = nextNorm
next, err = postItr.Next()
}
return lastDocNum, lastFreq, lastNorm, bufLoc, err
}
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
use1HitEncoding func(uint64) (bool, uint64, uint64),
w *CountHashWriter, bufMaxVarintLen64 []byte) (
offset uint64, err error) {
termCardinality := postings.GetCardinality()
if termCardinality <= 0 {
return 0, nil
}
if use1HitEncoding != nil {
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality)
if encodeAs1Hit {
return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil
}
}
var tfOffset uint64
tfOffset, _, err = tfEncoder.writeAt(w)
if err != nil {
return 0, err
}
var locOffset uint64
locOffset, _, err = locEncoder.writeAt(w)
if err != nil {
return 0, err
}
postingsOffset := uint64(w.Count())
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return 0, err
}
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return 0, err
}
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
if err != nil {
return 0, err
}
return postingsOffset, nil
}
type varintEncoder func(uint64) (int, error)
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) {
var rv [][]uint64 // The remapped or newDocNums for each segment.
var newDocNum uint64
var curr int
var data, compressed []byte
var metaBuf bytes.Buffer
varBuf := make([]byte, binary.MaxVarintLen64)
metaEncode := func(val uint64) (int, error) {
wb := binary.PutUvarint(varBuf, val)
return metaBuf.Write(varBuf[:wb])
}
vals := make([][][]byte, len(fieldsInv))
typs := make([][]byte, len(fieldsInv))
poss := make([][][]uint64, len(fieldsInv))
var posBuf []uint64
docNumOffsets := make([]uint64, newSegDocCount)
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
defer visitDocumentCtxPool.Put(vdc)
// for each segment
for segI, segment := range segments {
// check for the closure in meantime
if isClosed(closeCh) {
return 0, nil, seg.ErrClosed
}
segNewDocNums := make([]uint64, segment.numDocs)
dropsI := drops[segI]
// optimize when the field mapping is the same across all
// segments and there are no deletions, via byte-copying
// of stored docs bytes directly to the writer
if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) {
err := segment.copyStoredDocs(newDocNum, docNumOffsets, w)
if err != nil {
return 0, nil, err
}
for i := uint64(0); i < segment.numDocs; i++ {
segNewDocNums[i] = newDocNum
newDocNum++
}
rv = append(rv, segNewDocNums)
continue
}
// for each doc num
for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
// TODO: roaring's API limits docNums to 32-bits?
if dropsI != nil && dropsI.Contains(uint32(docNum)) {
segNewDocNums[docNum] = docDropped
continue
}
segNewDocNums[docNum] = newDocNum
curr = 0
metaBuf.Reset()
data = data[:0]
posTemp := posBuf
// collect all the data
for i := 0; i < len(fieldsInv); i++ {
vals[i] = vals[i][:0]
typs[i] = typs[i][:0]
poss[i] = poss[i][:0]
}
err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
fieldID := int(fieldsMap[field]) - 1
vals[fieldID] = append(vals[fieldID], value)
typs[fieldID] = append(typs[fieldID], typ)
// copy array positions to preserve them beyond the scope of this callback
var curPos []uint64
if len(pos) > 0 {
if cap(posTemp) < len(pos) {
posBuf = make([]uint64, len(pos)*len(fieldsInv))
posTemp = posBuf
}
curPos = posTemp[0:len(pos)]
copy(curPos, pos)
posTemp = posTemp[len(pos):]
}
poss[fieldID] = append(poss[fieldID], curPos)
return true
})
if err != nil {
return 0, nil, err
}
// _id field special case optimizes ExternalID() lookups
idFieldVal := vals[uint16(0)][0]
_, err = metaEncode(uint64(len(idFieldVal)))
if err != nil {
return 0, nil, err
}
// now walk the non-"_id" fields in order
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
storedFieldValues := vals[fieldID]
stf := typs[fieldID]
spf := poss[fieldID]
var err2 error
curr, data, err2 = persistStoredFieldValues(fieldID,
storedFieldValues, stf, spf, curr, metaEncode, data)
if err2 != nil {
return 0, nil, err2
}
}
metaBytes := metaBuf.Bytes()
compressed = snappy.Encode(compressed[:cap(compressed)], data)
// record where we're about to start writing
docNumOffsets[newDocNum] = uint64(w.Count())
// write out the meta len and compressed data len
_, err = writeUvarints(w,
uint64(len(metaBytes)),
uint64(len(idFieldVal)+len(compressed)))
if err != nil {
return 0, nil, err
}
// now write the meta
_, err = w.Write(metaBytes)
if err != nil {
return 0, nil, err
}
// now write the _id field val (counted as part of the 'compressed' data)
_, err = w.Write(idFieldVal)
if err != nil {
return 0, nil, err
}
// now write the compressed data
_, err = w.Write(compressed)
if err != nil {
return 0, nil, err
}
newDocNum++
}
rv = append(rv, segNewDocNums)
}
// return value is the start of the stored index
storedIndexOffset := uint64(w.Count())
// now write out the stored doc index
for _, docNumOffset := range docNumOffsets {
err := binary.Write(w, binary.BigEndian, docNumOffset)
if err != nil {
return 0, nil, err
}
}
return storedIndexOffset, rv, nil
}
// copyStoredDocs writes out a segment's stored doc info, optimized by
// using a single Write() call for the entire set of bytes. The
// newDocNumOffsets is filled with the new offsets for each doc.
func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64,
w *CountHashWriter) error {
if s.numDocs <= 0 {
return nil
}
indexOffset0, storedOffset0, _, _, _ :=
s.getDocStoredOffsets(0) // the segment's first doc
indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN :=
s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc
storedOffset0New := uint64(w.Count())
storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN]
_, err := w.Write(storedBytes)
if err != nil {
return err
}
// remap the storedOffset's for the docs into new offsets relative
// to storedOffset0New, filling the given docNumOffsetsOut array
for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 {
storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New
newDocNumOffsets[newDocNum] = storedOffsetNew
newDocNum += 1
}
return nil
}
// mergeFields builds a unified list of fields used across all the
// input segments, and computes whether the fields are the same across
// segments (which depends on fields to be sorted in the same way
// across segments)
func mergeFields(segments []*SegmentBase) (bool, []string) {
fieldsSame := true
var segment0Fields []string
if len(segments) > 0 {
segment0Fields = segments[0].Fields()
}
fieldsExist := map[string]struct{}{}
for _, segment := range segments {
fields := segment.Fields()
for fieldi, field := range fields {
fieldsExist[field] = struct{}{}
if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field {
fieldsSame = false
}
}
}
rv := make([]string, 0, len(fieldsExist))
// ensure _id stays first
rv = append(rv, "_id")
for k := range fieldsExist {
if k != "_id" {
rv = append(rv, k)
}
}
sort.Strings(rv[1:]) // leave _id as first
return fieldsSame, rv
}
func isClosed(closeCh chan struct{}) bool {
select {
case <-closeCh:
return true
default:
return false
}
}

@ -0,0 +1,860 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"math"
"sort"
"sync"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
"github.com/golang/snappy"
)
var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
// ValidateDocFields can be set by applications to perform additional checks
// on fields in a document being added to a new segment, by default it does
// nothing.
// This API is experimental and may be removed at any time.
var ValidateDocFields = func(field document.Field) error {
return nil
}
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
// SegmentBase from analysis results
func (z *ZapPlugin) New(results []*index.AnalysisResult) (
segment.Segment, uint64, error) {
return z.newWithChunkMode(results, DefaultChunkMode)
}
func (*ZapPlugin) newWithChunkMode(results []*index.AnalysisResult,
chunkMode uint32) (segment.Segment, uint64, error) {
s := interimPool.Get().(*interim)
var br bytes.Buffer
if s.lastNumDocs > 0 {
// use previous results to initialize the buf with an estimate
// size, but note that the interim instance comes from a
// global interimPool, so multiple scorch instances indexing
// different docs can lead to low quality estimates
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
NewSegmentBufferNumResultsFactor)
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
NewSegmentBufferAvgBytesPerDocFactor)
br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
}
s.results = results
s.chunkMode = chunkMode
s.w = NewCountHashWriter(&br)
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
err := s.convert()
if err != nil {
return nil, uint64(0), err
}
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode,
s.FieldsMap, s.FieldsInv, uint64(len(results)),
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
if err == nil && s.reset() == nil {
s.lastNumDocs = len(results)
s.lastOutSize = len(br.Bytes())
interimPool.Put(s)
}
return sb, uint64(len(br.Bytes())), err
}
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
results []*index.AnalysisResult
chunkMode uint32
w *CountHashWriter
// FieldsMap adds 1 to field id to avoid zero value issues
// name -> field id + 1
FieldsMap map[string]uint16
// FieldsInv is the inverse of FieldsMap
// field id -> name
FieldsInv []string
// Term dictionaries for each field
// field id -> term -> postings list id + 1
Dicts []map[string]uint64
// Terms for each field, where terms are sorted ascending
// field id -> []term
DictKeys [][]string
// Fields whose IncludeDocValues is true
// field id -> bool
IncludeDocValues []bool
// postings id -> bitmap of docNums
Postings []*roaring.Bitmap
// postings id -> freq/norm's, one for each docNum in postings
FreqNorms [][]interimFreqNorm
freqNormsBacking []interimFreqNorm
// postings id -> locs, one for each freq
Locs [][]interimLoc
locsBacking []interimLoc
numTermsPerPostingsList []int // key is postings list id
numLocsPerPostingsList []int // key is postings list id
builder *vellum.Builder
builderBuf bytes.Buffer
metaBuf bytes.Buffer
tmp0 []byte
tmp1 []byte
lastNumDocs int
lastOutSize int
}
func (s *interim) reset() (err error) {
s.results = nil
s.chunkMode = 0
s.w = nil
s.FieldsMap = nil
s.FieldsInv = nil
for i := range s.Dicts {
s.Dicts[i] = nil
}
s.Dicts = s.Dicts[:0]
for i := range s.DictKeys {
s.DictKeys[i] = s.DictKeys[i][:0]
}
s.DictKeys = s.DictKeys[:0]
for i := range s.IncludeDocValues {
s.IncludeDocValues[i] = false
}
s.IncludeDocValues = s.IncludeDocValues[:0]
for _, idn := range s.Postings {
idn.Clear()
}
s.Postings = s.Postings[:0]
s.FreqNorms = s.FreqNorms[:0]
for i := range s.freqNormsBacking {
s.freqNormsBacking[i] = interimFreqNorm{}
}
s.freqNormsBacking = s.freqNormsBacking[:0]
s.Locs = s.Locs[:0]
for i := range s.locsBacking {
s.locsBacking[i] = interimLoc{}
}
s.locsBacking = s.locsBacking[:0]
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
s.builderBuf.Reset()
if s.builder != nil {
err = s.builder.Reset(&s.builderBuf)
}
s.metaBuf.Reset()
s.tmp0 = s.tmp0[:0]
s.tmp1 = s.tmp1[:0]
s.lastNumDocs = 0
s.lastOutSize = 0
return err
}
func (s *interim) grabBuf(size int) []byte {
buf := s.tmp0
if cap(buf) < size {
buf = make([]byte, size)
s.tmp0 = buf
}
return buf[0:size]
}
type interimStoredField struct {
vals [][]byte
typs []byte
arrayposs [][]uint64 // array positions
}
type interimFreqNorm struct {
freq uint64
norm float32
numLocs int
}
type interimLoc struct {
fieldID uint16
pos uint64
start uint64
end uint64
arrayposs []uint64
}
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
s.FieldsMap = map[string]uint16{}
s.getOrDefineField("_id") // _id field is fieldID 0
for _, result := range s.results {
for _, field := range result.Document.CompositeFields {
s.getOrDefineField(field.Name())
}
for _, field := range result.Document.Fields {
s.getOrDefineField(field.Name())
}
}
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
for fieldID, fieldName := range s.FieldsInv {
s.FieldsMap[fieldName] = uint16(fieldID + 1)
}
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
} else {
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
}
s.prepareDicts()
for _, dict := range s.DictKeys {
sort.Strings(dict)
}
s.processDocuments()
storedIndexOffset, err := s.writeStoredFields()
if err != nil {
return 0, 0, 0, nil, err
}
var fdvIndexOffset uint64
var dictOffsets []uint64
if len(s.results) > 0 {
fdvIndexOffset, dictOffsets, err = s.writeDicts()
if err != nil {
return 0, 0, 0, nil, err
}
} else {
dictOffsets = make([]uint64, len(s.FieldsInv))
}
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
if err != nil {
return 0, 0, 0, nil, err
}
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}
func (s *interim) getOrDefineField(fieldName string) int {
fieldIDPlus1, exists := s.FieldsMap[fieldName]
if !exists {
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
s.FieldsMap[fieldName] = fieldIDPlus1
s.FieldsInv = append(s.FieldsInv, fieldName)
s.Dicts = append(s.Dicts, make(map[string]uint64))
n := len(s.DictKeys)
if n < cap(s.DictKeys) {
s.DictKeys = s.DictKeys[:n+1]
s.DictKeys[n] = s.DictKeys[n][:0]
} else {
s.DictKeys = append(s.DictKeys, []string(nil))
}
}
return int(fieldIDPlus1 - 1)
}
// fill Dicts and DictKeys from analysis results
func (s *interim) prepareDicts() {
var pidNext int
var totTFs int
var totLocs int
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
dict := s.Dicts[fieldID]
dictKeys := s.DictKeys[fieldID]
for term, tf := range tfs {
pidPlus1, exists := dict[term]
if !exists {
pidNext++
pidPlus1 = uint64(pidNext)
dict[term] = pidPlus1
dictKeys = append(dictKeys, term)
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
}
pid := pidPlus1 - 1
s.numTermsPerPostingsList[pid] += 1
s.numLocsPerPostingsList[pid] += len(tf.Locations)
totLocs += len(tf.Locations)
}
totTFs += len(tfs)
s.DictKeys[fieldID] = dictKeys
}
for _, result := range s.results {
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name()))
_, tf := field.Analyze()
visitField(fieldID, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
tf := result.Analyzed[i]
visitField(fieldID, tf)
}
}
numPostingsLists := pidNext
if cap(s.Postings) >= numPostingsLists {
s.Postings = s.Postings[:numPostingsLists]
} else {
postings := make([]*roaring.Bitmap, numPostingsLists)
copy(postings, s.Postings[:cap(s.Postings)])
for i := 0; i < numPostingsLists; i++ {
if postings[i] == nil {
postings[i] = roaring.New()
}
}
s.Postings = postings
}
if cap(s.FreqNorms) >= numPostingsLists {
s.FreqNorms = s.FreqNorms[:numPostingsLists]
} else {
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
}
if cap(s.freqNormsBacking) >= totTFs {
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
} else {
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
}
freqNormsBacking := s.freqNormsBacking
for pid, numTerms := range s.numTermsPerPostingsList {
s.FreqNorms[pid] = freqNormsBacking[0:0]
freqNormsBacking = freqNormsBacking[numTerms:]
}
if cap(s.Locs) >= numPostingsLists {
s.Locs = s.Locs[:numPostingsLists]
} else {
s.Locs = make([][]interimLoc, numPostingsLists)
}
if cap(s.locsBacking) >= totLocs {
s.locsBacking = s.locsBacking[:totLocs]
} else {
s.locsBacking = make([]interimLoc, totLocs)
}
locsBacking := s.locsBacking
for pid, numLocs := range s.numLocsPerPostingsList {
s.Locs[pid] = locsBacking[0:0]
locsBacking = locsBacking[numLocs:]
}
}
func (s *interim) processDocuments() {
numFields := len(s.FieldsInv)
reuseFieldLens := make([]int, numFields)
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)
for docNum, result := range s.results {
for i := 0; i < numFields; i++ { // clear these for reuse
reuseFieldLens[i] = 0
reuseFieldTFs[i] = nil
}
s.processDocument(uint64(docNum), result,
reuseFieldLens, reuseFieldTFs)
}
}
func (s *interim) processDocument(docNum uint64,
result *index.AnalysisResult,
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
visitField := func(fieldID uint16, fieldName string,
ln int, tf analysis.TokenFrequencies) {
fieldLens[fieldID] += ln
existingFreqs := fieldTFs[fieldID]
if existingFreqs != nil {
existingFreqs.MergeAll(fieldName, tf)
} else {
fieldTFs[fieldID] = tf
}
}
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name()))
ln, tf := field.Analyze()
visitField(fieldID, field.Name(), ln, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
ln := result.Length[i]
tf := result.Analyzed[i]
visitField(fieldID, field.Name(), ln, tf)
}
// now that it's been rolled up into fieldTFs, walk that
for fieldID, tfs := range fieldTFs {
dict := s.Dicts[fieldID]
norm := math.Float32frombits(uint32(fieldLens[fieldID]))
for term, tf := range tfs {
pid := dict[term] - 1
bs := s.Postings[pid]
bs.Add(uint32(docNum))
s.FreqNorms[pid] = append(s.FreqNorms[pid],
interimFreqNorm{
freq: uint64(tf.Frequency()),
norm: norm,
numLocs: len(tf.Locations),
})
if len(tf.Locations) > 0 {
locs := s.Locs[pid]
for _, loc := range tf.Locations {
var locf = uint16(fieldID)
if loc.Field != "" {
locf = uint16(s.getOrDefineField(loc.Field))
}
var arrayposs []uint64
if len(loc.ArrayPositions) > 0 {
arrayposs = loc.ArrayPositions
}
locs = append(locs, interimLoc{
fieldID: locf,
pos: uint64(loc.Position),
start: uint64(loc.Start),
end: uint64(loc.End),
arrayposs: arrayposs,
})
}
s.Locs[pid] = locs
}
}
}
}
func (s *interim) writeStoredFields() (
storedIndexOffset uint64, err error) {
varBuf := make([]byte, binary.MaxVarintLen64)
metaEncode := func(val uint64) (int, error) {
wb := binary.PutUvarint(varBuf, val)
return s.metaBuf.Write(varBuf[:wb])
}
data, compressed := s.tmp0[:0], s.tmp1[:0]
defer func() { s.tmp0, s.tmp1 = data, compressed }()
// keyed by docNum
docStoredOffsets := make([]uint64, len(s.results))
// keyed by fieldID, for the current doc in the loop
docStoredFields := map[uint16]interimStoredField{}
for docNum, result := range s.results {
for fieldID := range docStoredFields { // reset for next doc
delete(docStoredFields, fieldID)
}
for _, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
opts := field.Options()
if opts.IsStored() {
isf := docStoredFields[fieldID]
isf.vals = append(isf.vals, field.Value())
isf.typs = append(isf.typs, encodeFieldType(field))
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
docStoredFields[fieldID] = isf
}
if opts.IncludeDocValues() {
s.IncludeDocValues[fieldID] = true
}
err := ValidateDocFields(field)
if err != nil {
return 0, err
}
}
var curr int
s.metaBuf.Reset()
data = data[:0]
// _id field special case optimizes ExternalID() lookups
idFieldVal := docStoredFields[uint16(0)].vals[0]
_, err = metaEncode(uint64(len(idFieldVal)))
if err != nil {
return 0, err
}
// handle non-"_id" fields
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
isf, exists := docStoredFields[uint16(fieldID)]
if exists {
curr, data, err = persistStoredFieldValues(
fieldID, isf.vals, isf.typs, isf.arrayposs,
curr, metaEncode, data)
if err != nil {
return 0, err
}
}
}
metaBytes := s.metaBuf.Bytes()
compressed = snappy.Encode(compressed[:cap(compressed)], data)
docStoredOffsets[docNum] = uint64(s.w.Count())
_, err := writeUvarints(s.w,
uint64(len(metaBytes)),
uint64(len(idFieldVal)+len(compressed)))
if err != nil {
return 0, err
}
_, err = s.w.Write(metaBytes)
if err != nil {
return 0, err
}
_, err = s.w.Write(idFieldVal)
if err != nil {
return 0, err
}
_, err = s.w.Write(compressed)
if err != nil {
return 0, err
}
}
storedIndexOffset = uint64(s.w.Count())
for _, docStoredOffset := range docStoredOffsets {
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
if err != nil {
return 0, err
}
}
return storedIndexOffset, nil
}
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
dictOffsets = make([]uint64, len(s.FieldsInv))
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
buf := s.grabBuf(binary.MaxVarintLen64)
// these int coders are initialized with chunk size 1024
// however this will be reset to the correct chunk size
// while processing each individual field-term section
tfEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1))
locEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1))
var docTermMap [][]byte
if s.builder == nil {
s.builder, err = vellum.New(&s.builderBuf, nil)
if err != nil {
return 0, nil, err
}
}
for fieldID, terms := range s.DictKeys {
if cap(docTermMap) < len(s.results) {
docTermMap = make([][]byte, len(s.results))
} else {
docTermMap = docTermMap[0:len(s.results)]
for docNum := range docTermMap { // reset the docTermMap
docTermMap[docNum] = docTermMap[docNum][:0]
}
}
dict := s.Dicts[fieldID]
for _, term := range terms { // terms are already sorted
pid := dict[term] - 1
postingsBS := s.Postings[pid]
freqNorms := s.FreqNorms[pid]
freqNormOffset := 0
locs := s.Locs[pid]
locOffset := 0
chunkSize, err := getChunkSize(s.chunkMode, postingsBS.GetCardinality(), uint64(len(s.results)))
if err != nil {
return 0, nil, err
}
tfEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1))
locEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1))
postingsItr := postingsBS.Iterator()
for postingsItr.HasNext() {
docNum := uint64(postingsItr.Next())
freqNorm := freqNorms[freqNormOffset]
err = tfEncoder.Add(docNum,
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
uint64(math.Float32bits(freqNorm.norm)))
if err != nil {
return 0, nil, err
}
if freqNorm.numLocs > 0 {
numBytesLocs := 0
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
numBytesLocs += totalUvarintBytes(
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
uint64(len(loc.arrayposs)), loc.arrayposs)
}
err = locEncoder.Add(docNum, uint64(numBytesLocs))
if err != nil {
return 0, nil, err
}
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
uint64(len(loc.arrayposs)))
if err != nil {
return 0, nil, err
}
err = locEncoder.Add(docNum, loc.arrayposs...)
if err != nil {
return 0, nil, err
}
}
locOffset += freqNorm.numLocs
}
freqNormOffset++
docTermMap[docNum] = append(
append(docTermMap[docNum], term...),
termSeparator)
}
tfEncoder.Close()
locEncoder.Close()
postingsOffset, err :=
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
if err != nil {
return 0, nil, err
}
if postingsOffset > uint64(0) {
err = s.builder.Insert([]byte(term), postingsOffset)
if err != nil {
return 0, nil, err
}
}
tfEncoder.Reset()
locEncoder.Reset()
}
err = s.builder.Close()
if err != nil {
return 0, nil, err
}
// record where this dictionary starts
dictOffsets[fieldID] = uint64(s.w.Count())
vellumData := s.builderBuf.Bytes()
// write out the length of the vellum data
n := binary.PutUvarint(buf, uint64(len(vellumData)))
_, err = s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
// write this vellum to disk
_, err = s.w.Write(vellumData)
if err != nil {
return 0, nil, err
}
// reset vellum for reuse
s.builderBuf.Reset()
err = s.builder.Reset(&s.builderBuf)
if err != nil {
return 0, nil, err
}
// write the field doc values
// NOTE: doc values continue to use legacy chunk mode
chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0)
if err != nil {
return 0, nil, err
}
fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(s.results)-1), s.w, false)
if s.IncludeDocValues[fieldID] {
for docNum, docTerms := range docTermMap {
if len(docTerms) > 0 {
err = fdvEncoder.Add(uint64(docNum), docTerms)
if err != nil {
return 0, nil, err
}
}
}
err = fdvEncoder.Close()
if err != nil {
return 0, nil, err
}
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
_, err = fdvEncoder.Write()
if err != nil {
return 0, nil, err
}
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
fdvEncoder.Reset()
} else {
fdvOffsetsStart[fieldID] = fieldNotUninverted
fdvOffsetsEnd[fieldID] = fieldNotUninverted
}
}
fdvIndexOffset = uint64(s.w.Count())
for i := 0; i < len(fdvOffsetsStart); i++ {
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
_, err := s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
_, err = s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
}
return fdvIndexOffset, dictOffsets, nil
}
func encodeFieldType(f document.Field) byte {
fieldType := byte('x')
switch f.(type) {
case *document.TextField:
fieldType = 't'
case *document.NumericField:
fieldType = 'n'
case *document.DateTimeField:
fieldType = 'd'
case *document.BooleanField:
fieldType = 'b'
case *document.GeoPointField:
fieldType = 'g'
case *document.CompositeField:
fieldType = 'c'
}
return fieldType
}
// returns the total # of bytes needed to encode the given uint64's
// into binary.PutUVarint() encoding
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
n = numUvarintBytes(a)
n += numUvarintBytes(b)
n += numUvarintBytes(c)
n += numUvarintBytes(d)
n += numUvarintBytes(e)
for _, v := range more {
n += numUvarintBytes(v)
}
return n
}
// returns # of bytes needed to encode x in binary.PutUvarint() encoding
func numUvarintBytes(x uint64) (n int) {
for x >= 0x80 {
x >>= 7
n++
}
return n + 1
}

@ -0,0 +1,37 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"github.com/blevesearch/bleve/index/scorch/segment"
)
// ZapPlugin implements the Plugin interface of
// the blevesearch/bleve/index/scorch/segment pkg
type ZapPlugin struct{}
func (*ZapPlugin) Type() string {
return Type
}
func (*ZapPlugin) Version() uint32 {
return Version
}
// Plugin returns an instance segment.Plugin for use
// by the Scorch indexing scheme
func Plugin() segment.Plugin {
return &ZapPlugin{}
}

@ -0,0 +1,801 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"encoding/binary"
"fmt"
"math"
"reflect"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizePostingsList int
var reflectStaticSizePostingsIterator int
var reflectStaticSizePosting int
var reflectStaticSizeLocation int
func init() {
var pl PostingsList
reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size())
var pi PostingsIterator
reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size())
var p Posting
reflectStaticSizePosting = int(reflect.TypeOf(p).Size())
var l Location
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
}
// FST or vellum value (uint64) encoding is determined by the top two
// highest-order or most significant bits...
//
// encoding : MSB
// name : 63 62 61...to...bit #0 (LSB)
// ----------+---+---+---------------------------------------------------
// general : 0 | 0 | 62-bits of postingsOffset.
// ~ : 0 | 1 | reserved for future.
// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum.
// ~ : 1 | 1 | reserved for future.
//
// Encoding "general" is able to handle all cases, where the
// postingsOffset points to more information about the postings for
// the term.
//
// Encoding "1-hit" is used to optimize a commonly seen case when a
// term has only a single hit. For example, a term in the _id field
// will have only 1 hit. The "1-hit" encoding is used for a term
// in a field when...
//
// - term vector info is disabled for that field;
// - and, the term appears in only a single doc for that field;
// - and, the term's freq is exactly 1 in that single doc for that field;
// - and, the docNum must fit into 31-bits;
//
// Otherwise, the "general" encoding is used instead.
//
// In the "1-hit" encoding, the field in that single doc may have
// other terms, which is supported in the "1-hit" encoding by the
// positive float31 norm.
const FSTValEncodingMask = uint64(0xc000000000000000)
const FSTValEncodingGeneral = uint64(0x0000000000000000)
const FSTValEncoding1Hit = uint64(0x8000000000000000)
func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 {
return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum)
}
func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) {
return (mask31Bits & v), (mask31Bits & (v >> 31))
}
const mask31Bits = uint64(0x000000007fffffff)
func under32Bits(x uint64) bool {
return x <= mask31Bits
}
const DocNum1HitFinished = math.MaxUint64
var NormBits1Hit = uint64(1)
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
sb *SegmentBase
postingsOffset uint64
freqOffset uint64
locOffset uint64
postings *roaring.Bitmap
except *roaring.Bitmap
// when normBits1Hit != 0, then this postings list came from a
// 1-hit encoding, and only the docNum1Hit & normBits1Hit apply
docNum1Hit uint64
normBits1Hit uint64
chunkSize uint64
}
// represents an immutable, empty postings list
var emptyPostingsList = &PostingsList{}
func (p *PostingsList) Size() int {
sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr
if p.except != nil {
sizeInBytes += int(p.except.GetSizeInBytes())
}
return sizeInBytes
}
func (p *PostingsList) OrInto(receiver *roaring.Bitmap) {
if p.normBits1Hit != 0 {
receiver.Add(uint32(p.docNum1Hit))
return
}
if p.postings != nil {
receiver.Or(p.postings)
}
}
// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool,
prealloc segment.PostingsIterator) segment.PostingsIterator {
if p.normBits1Hit == 0 && p.postings == nil {
return emptyPostingsIterator
}
var preallocPI *PostingsIterator
pi, ok := prealloc.(*PostingsIterator)
if ok && pi != nil {
preallocPI = pi
}
if preallocPI == emptyPostingsIterator {
preallocPI = nil
}
return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI)
}
func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
rv *PostingsIterator) *PostingsIterator {
if rv == nil {
rv = &PostingsIterator{}
} else {
freqNormReader := rv.freqNormReader
if freqNormReader != nil {
freqNormReader.reset()
}
locReader := rv.locReader
if locReader != nil {
locReader.reset()
}
nextLocs := rv.nextLocs[:0]
nextSegmentLocs := rv.nextSegmentLocs[:0]
buf := rv.buf
*rv = PostingsIterator{} // clear the struct
rv.freqNormReader = freqNormReader
rv.locReader = locReader
rv.nextLocs = nextLocs
rv.nextSegmentLocs = nextSegmentLocs
rv.buf = buf
}
rv.postings = p
rv.includeFreqNorm = includeFreq || includeNorm || includeLocs
rv.includeLocs = includeLocs
if p.normBits1Hit != 0 {
// "1-hit" encoding
rv.docNum1Hit = p.docNum1Hit
rv.normBits1Hit = p.normBits1Hit
if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) {
rv.docNum1Hit = DocNum1HitFinished
}
return rv
}
// "general" encoding, check if empty
if p.postings == nil {
return rv
}
// initialize freq chunk reader
if rv.includeFreqNorm {
rv.freqNormReader = newChunkedIntDecoder(p.sb.mem, p.freqOffset, rv.freqNormReader)
}
// initialize the loc chunk reader
if rv.includeLocs {
rv.locReader = newChunkedIntDecoder(p.sb.mem, p.locOffset, rv.locReader)
}
rv.all = p.postings.Iterator()
if p.except != nil {
rv.ActualBM = roaring.AndNot(p.postings, p.except)
rv.Actual = rv.ActualBM.Iterator()
} else {
rv.ActualBM = p.postings
rv.Actual = rv.all // Optimize to use same iterator for all & Actual.
}
return rv
}
// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
var n, e uint64
if p.normBits1Hit != 0 {
n = 1
if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) {
e = 1
}
} else if p.postings != nil {
n = p.postings.GetCardinality()
if p.except != nil {
e = p.postings.AndCardinality(p.except)
}
}
return n - e
}
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
rv.postingsOffset = postingsOffset
// handle "1-hit" encoding special case
if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit {
return rv.init1Hit(postingsOffset)
}
// read the location of the freq/norm details
var n uint64
var read int
rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64])
n += uint64(read)
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var postingsLen uint64
postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
if rv.postings == nil {
rv.postings = roaring.NewBitmap()
}
_, err := rv.postings.FromBuffer(roaringBytes)
if err != nil {
return fmt.Errorf("error loading roaring bitmap: %v", err)
}
rv.chunkSize, err = getChunkSize(d.sb.chunkMode,
rv.postings.GetCardinality(), d.sb.numDocs)
if err != nil {
return err
}
return nil
}
func (rv *PostingsList) init1Hit(fstVal uint64) error {
docNum, normBits := FSTValDecode1Hit(fstVal)
rv.docNum1Hit = docNum
rv.normBits1Hit = normBits
return nil
}
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
postings *PostingsList
all roaring.IntPeekable
Actual roaring.IntPeekable
ActualBM *roaring.Bitmap
currChunk uint32
freqNormReader *chunkedIntDecoder
locReader *chunkedIntDecoder
next Posting // reused across Next() calls
nextLocs []Location // reused across Next() calls
nextSegmentLocs []segment.Location // reused across Next() calls
docNum1Hit uint64
normBits1Hit uint64
buf []byte
includeFreqNorm bool
includeLocs bool
}
var emptyPostingsIterator = &PostingsIterator{}
func (i *PostingsIterator) Size() int {
sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr +
i.next.Size()
// account for freqNormReader, locReader if we start using this.
for _, entry := range i.nextLocs {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
func (i *PostingsIterator) loadChunk(chunk int) error {
if i.includeFreqNorm {
err := i.freqNormReader.loadChunk(chunk)
if err != nil {
return err
}
}
if i.includeLocs {
err := i.locReader.loadChunk(chunk)
if err != nil {
return err
}
}
i.currChunk = uint32(chunk)
return nil
}
func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
if i.normBits1Hit != 0 {
return 1, i.normBits1Hit, false, nil
}
freqHasLocs, err := i.freqNormReader.readUvarint()
if err != nil {
return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
}
freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
normBits, err := i.freqNormReader.readUvarint()
if err != nil {
return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, hasLocs, nil
}
func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) {
if i.normBits1Hit != 0 {
return false, nil
}
freqHasLocs, err := i.freqNormReader.readUvarint()
if err != nil {
return false, fmt.Errorf("error reading freqHasLocs: %v", err)
}
i.freqNormReader.SkipUvarint() // Skip normBits.
return freqHasLocs&0x01 != 0, nil // See decodeFreqHasLocs() / hasLocs.
}
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
rv := freq << 1
if hasLocs {
rv = rv | 0x01 // 0'th LSB encodes whether there are locations
}
return rv
}
func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
freq := freqHasLocs >> 1
hasLocs := freqHasLocs&0x01 != 0
return freq, hasLocs
}
// readLocation processes all the integers on the stream representing a single
// location.
func (i *PostingsIterator) readLocation(l *Location) error {
// read off field
fieldID, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading location field: %v", err)
}
// read off pos
pos, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading location pos: %v", err)
}
// read off start
start, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading location start: %v", err)
}
// read off end
end, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading location end: %v", err)
}
// read off num array pos
numArrayPos, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading location num array pos: %v", err)
}
l.field = i.postings.sb.fieldsInv[fieldID]
l.pos = pos
l.start = start
l.end = end
if cap(l.ap) < int(numArrayPos) {
l.ap = make([]uint64, int(numArrayPos))
} else {
l.ap = l.ap[:int(numArrayPos)]
}
// read off array positions
for k := 0; k < int(numArrayPos); k++ {
ap, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading array position: %v", err)
}
l.ap[k] = ap
}
return nil
}
// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
return i.nextAtOrAfter(0)
}
// Advance returns the posting at the specified docNum or it is not present
// the next posting, or if the end is reached, nil
func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) {
return i.nextAtOrAfter(docNum)
}
// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) {
docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter)
if err != nil || !exists {
return nil, err
}
i.next = Posting{} // clear the struct
rv := &i.next
rv.docNum = docNum
if !i.includeFreqNorm {
return rv, nil
}
var normBits uint64
var hasLocs bool
rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
if err != nil {
return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.includeLocs && hasLocs {
// prepare locations into reused slices, where we assume
// rv.freq >= "number of locs", since in a composite field,
// some component fields might have their IncludeTermVector
// flags disabled while other component fields are enabled
if cap(i.nextLocs) >= int(rv.freq) {
i.nextLocs = i.nextLocs[0:rv.freq]
} else {
i.nextLocs = make([]Location, rv.freq, rv.freq*2)
}
if cap(i.nextSegmentLocs) < int(rv.freq) {
i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2)
}
rv.locs = i.nextSegmentLocs[:0]
numLocsBytes, err := i.locReader.readUvarint()
if err != nil {
return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
}
j := 0
startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader
for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) {
err := i.readLocation(&i.nextLocs[j])
if err != nil {
return nil, err
}
rv.locs = append(rv.locs, &i.nextLocs[j])
j++
}
}
return rv, nil
}
// nextDocNum returns the next docNum on the postings list, and also
// sets up the currChunk / loc related fields of the iterator.
func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) {
if i.normBits1Hit != 0 {
if i.docNum1Hit == DocNum1HitFinished {
return 0, false, nil
}
if i.docNum1Hit < atOrAfter {
// advanced past our 1-hit
i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum
return 0, false, nil
}
docNum := i.docNum1Hit
i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum
return docNum, true, nil
}
if i.Actual == nil || !i.Actual.HasNext() {
return 0, false, nil
}
if i.postings == nil || i.postings.postings == i.ActualBM {
return i.nextDocNumAtOrAfterClean(atOrAfter)
}
i.Actual.AdvanceIfNeeded(uint32(atOrAfter))
if !i.Actual.HasNext() {
// couldn't find anything
return 0, false, nil
}
n := i.Actual.Next()
allN := i.all.Next()
nChunk := n / uint32(i.postings.chunkSize)
// when allN becomes >= to here, then allN is in the same chunk as nChunk.
allNReachesNChunk := nChunk * uint32(i.postings.chunkSize)
// n is the next actual hit (excluding some postings), and
// allN is the next hit in the full postings, and
// if they don't match, move 'all' forwards until they do
for allN != n {
// we've reached same chunk, so move the freq/norm/loc decoders forward
if i.includeFreqNorm && allN >= allNReachesNChunk {
err := i.currChunkNext(nChunk)
if err != nil {
return 0, false, err
}
}
allN = i.all.Next()
}
if i.includeFreqNorm && (i.currChunk != nChunk || i.freqNormReader.isNil()) {
err := i.loadChunk(int(nChunk))
if err != nil {
return 0, false, fmt.Errorf("error loading chunk: %v", err)
}
}
return uint64(n), true, nil
}
// optimization when the postings list is "clean" (e.g., no updates &
// no deletions) where the all bitmap is the same as the actual bitmap
func (i *PostingsIterator) nextDocNumAtOrAfterClean(
atOrAfter uint64) (uint64, bool, error) {
if !i.includeFreqNorm {
i.Actual.AdvanceIfNeeded(uint32(atOrAfter))
if !i.Actual.HasNext() {
return 0, false, nil // couldn't find anything
}
return uint64(i.Actual.Next()), true, nil
}
// freq-norm's needed, so maintain freq-norm chunk reader
sameChunkNexts := 0 // # of times we called Next() in the same chunk
n := i.Actual.Next()
nChunk := n / uint32(i.postings.chunkSize)
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
nChunkPrev := nChunk
nChunk = n / uint32(i.postings.chunkSize)
if nChunk != nChunkPrev {
sameChunkNexts = 0
} else {
sameChunkNexts += 1
}
}
if uint64(n) < atOrAfter {
// couldn't find anything
return 0, false, nil
}
for j := 0; j < sameChunkNexts; j++ {
err := i.currChunkNext(nChunk)
if err != nil {
return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err)
}
}
if i.currChunk != nChunk || i.freqNormReader.isNil() {
err := i.loadChunk(int(nChunk))
if err != nil {
return 0, false, fmt.Errorf("error loading chunk: %v", err)
}
}
return uint64(n), true, nil
}
func (i *PostingsIterator) currChunkNext(nChunk uint32) error {
if i.currChunk != nChunk || i.freqNormReader.isNil() {
err := i.loadChunk(int(nChunk))
if err != nil {
return fmt.Errorf("error loading chunk: %v", err)
}
}
// read off freq/offsets even though we don't care about them
hasLocs, err := i.skipFreqNormReadHasLocs()
if err != nil {
return err
}
if i.includeLocs && hasLocs {
numLocsBytes, err := i.locReader.readUvarint()
if err != nil {
return fmt.Errorf("error reading location numLocsBytes: %v", err)
}
// skip over all the location bytes
i.locReader.SkipBytes(int(numLocsBytes))
}
return nil
}
// DocNum1Hit returns the docNum and true if this is "1-hit" optimized
// and the docNum is available.
func (p *PostingsIterator) DocNum1Hit() (uint64, bool) {
if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished {
return p.docNum1Hit, true
}
return 0, false
}
// ActualBitmap returns the underlying actual bitmap
// which can be used up the stack for optimizations
func (p *PostingsIterator) ActualBitmap() *roaring.Bitmap {
return p.ActualBM
}
// ReplaceActual replaces the ActualBM with the provided
// bitmap
func (p *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) {
p.ActualBM = abm
p.Actual = abm.Iterator()
}
// PostingsIteratorFromBitmap constructs a PostingsIterator given an
// "actual" bitmap.
func PostingsIteratorFromBitmap(bm *roaring.Bitmap,
includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) {
return &PostingsIterator{
ActualBM: bm,
Actual: bm.Iterator(),
includeFreqNorm: includeFreqNorm,
includeLocs: includeLocs,
}, nil
}
// PostingsIteratorFrom1Hit constructs a PostingsIterator given a
// 1-hit docNum.
func PostingsIteratorFrom1Hit(docNum1Hit uint64,
includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) {
return &PostingsIterator{
docNum1Hit: docNum1Hit,
normBits1Hit: NormBits1Hit,
includeFreqNorm: includeFreqNorm,
includeLocs: includeLocs,
}, nil
}
// Posting is a single entry in a postings list
type Posting struct {
docNum uint64
freq uint64
norm float32
locs []segment.Location
}
func (p *Posting) Size() int {
sizeInBytes := reflectStaticSizePosting
for _, entry := range p.locs {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
return p.docNum
}
// Frequency returns the frequencies of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 {
return p.freq
}
// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
return float64(float32(1.0 / math.Sqrt(float64(math.Float32bits(p.norm)))))
}
// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
return p.locs
}
// NormUint64 returns the norm value as uint64
func (p *Posting) NormUint64() uint64 {
return uint64(math.Float32bits(p.norm))
}
// Location represents the location of a single occurrence
type Location struct {
field string
pos uint64
start uint64
end uint64
ap []uint64
}
func (l *Location) Size() int {
return reflectStaticSizeLocation +
len(l.field) +
len(l.ap)*size.SizeOfUint64
}
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
return l.field
}
// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
return l.start
}
// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
return l.end
}
// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
return l.pos
}
// ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 {
return l.ap
}

@ -0,0 +1,43 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import "encoding/binary"
func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) {
_, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum)
meta := s.mem[storedOffset+n : storedOffset+n+metaLen]
data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen]
return meta, data
}
func (s *SegmentBase) getDocStoredOffsets(docNum uint64) (
uint64, uint64, uint64, uint64, uint64) {
indexOffset := s.storedIndexOffset + (8 * docNum)
storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
var n uint64
metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64])
n += uint64(read)
dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64])
n += uint64(read)
return indexOffset, storedOffset, n, metaLen, dataLen
}

@ -0,0 +1,572 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"os"
"sync"
"unsafe"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
"github.com/couchbase/vellum"
mmap "github.com/blevesearch/mmap-go"
"github.com/golang/snappy"
)
var reflectStaticSizeSegmentBase int
func init() {
var sb SegmentBase
reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb))
}
// Open returns a zap impl of a segment
func (*ZapPlugin) Open(path string) (segment.Segment, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
mm, err := mmap.Map(f, mmap.RDONLY, 0)
if err != nil {
// mmap failed, try to close the file
_ = f.Close()
return nil, err
}
rv := &Segment{
SegmentBase: SegmentBase{
mem: mm[0 : len(mm)-FooterSize],
fieldsMap: make(map[string]uint16),
fieldDvReaders: make(map[uint16]*docValueReader),
fieldFSTs: make(map[uint16]*vellum.FST),
},
f: f,
mm: mm,
path: path,
refs: 1,
}
rv.SegmentBase.updateSize()
err = rv.loadConfig()
if err != nil {
_ = rv.Close()
return nil, err
}
err = rv.loadFields()
if err != nil {
_ = rv.Close()
return nil, err
}
err = rv.loadDvReaders()
if err != nil {
_ = rv.Close()
return nil, err
}
return rv, nil
}
// SegmentBase is a memory only, read-only implementation of the
// segment.Segment interface, using zap's data representation.
type SegmentBase struct {
mem []byte
memCRC uint32
chunkMode uint32
fieldsMap map[string]uint16 // fieldName -> fieldID+1
fieldsInv []string // fieldID -> fieldName
numDocs uint64
storedIndexOffset uint64
fieldsIndexOffset uint64
docValueOffset uint64
dictLocs []uint64
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
fieldDvNames []string // field names cached in fieldDvReaders
size uint64
m sync.Mutex
fieldFSTs map[uint16]*vellum.FST
}
func (sb *SegmentBase) Size() int {
return int(sb.size)
}
func (sb *SegmentBase) updateSize() {
sizeInBytes := reflectStaticSizeSegmentBase +
cap(sb.mem)
// fieldsMap
for k := range sb.fieldsMap {
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16
}
// fieldsInv, dictLocs
for _, entry := range sb.fieldsInv {
sizeInBytes += len(entry) + size.SizeOfString
}
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64
// fieldDvReaders
for _, v := range sb.fieldDvReaders {
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr
if v != nil {
sizeInBytes += v.size()
}
}
sb.size = uint64(sizeInBytes)
}
func (sb *SegmentBase) AddRef() {}
func (sb *SegmentBase) DecRef() (err error) { return nil }
func (sb *SegmentBase) Close() (err error) { return nil }
// Segment implements a persisted segment.Segment interface, by
// embedding an mmap()'ed SegmentBase.
type Segment struct {
SegmentBase
f *os.File
mm mmap.MMap
path string
version uint32
crc uint32
m sync.Mutex // Protects the fields that follow.
refs int64
}
func (s *Segment) Size() int {
// 8 /* size of file pointer */
// 4 /* size of version -> uint32 */
// 4 /* size of crc -> uint32 */
sizeOfUints := 16
sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints
// mutex, refs -> int64
sizeInBytes += 16
// do not include the mmap'ed part
return sizeInBytes + s.SegmentBase.Size() - cap(s.mem)
}
func (s *Segment) AddRef() {
s.m.Lock()
s.refs++
s.m.Unlock()
}
func (s *Segment) DecRef() (err error) {
s.m.Lock()
s.refs--
if s.refs == 0 {
err = s.closeActual()
}
s.m.Unlock()
return err
}
func (s *Segment) loadConfig() error {
crcOffset := len(s.mm) - 4
s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
verOffset := crcOffset - 4
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
if s.version != Version {
return fmt.Errorf("unsupported version %d", s.version)
}
chunkOffset := verOffset - 4
s.chunkMode = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
docValueOffset := chunkOffset - 8
s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8])
fieldsIndexOffset := docValueOffset - 8
s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8])
storedIndexOffset := fieldsIndexOffset - 8
s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8])
numDocsOffset := storedIndexOffset - 8
s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8])
return nil
}
func (s *SegmentBase) loadFields() error {
// NOTE for now we assume the fields index immediately precedes
// the footer, and if this changes, need to adjust accordingly (or
// store explicit length), where s.mem was sliced from s.mm in Open().
fieldsIndexEnd := uint64(len(s.mem))
// iterate through fields index
var fieldID uint64
for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd])
n := uint64(read)
s.dictLocs = append(s.dictLocs, dictLoc)
var nameLen uint64
nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd])
n += uint64(read)
name := string(s.mem[addr+n : addr+n+nameLen])
s.fieldsInv = append(s.fieldsInv, name)
s.fieldsMap[name] = uint16(fieldID + 1)
fieldID++
}
return nil
}
// Dictionary returns the term dictionary for the specified field
func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) {
dict, err := s.dictionary(field)
if err == nil && dict == nil {
return &segment.EmptyDictionary{}, nil
}
return dict, err
}
func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
fieldIDPlus1 := sb.fieldsMap[field]
if fieldIDPlus1 > 0 {
rv = &Dictionary{
sb: sb,
field: field,
fieldID: fieldIDPlus1 - 1,
}
dictStart := sb.dictLocs[rv.fieldID]
if dictStart > 0 {
var ok bool
sb.m.Lock()
if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok {
// read the length of the vellum data
vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64])
fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
rv.fst, err = vellum.Load(fstBytes)
if err != nil {
sb.m.Unlock()
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
}
sb.fieldFSTs[rv.fieldID] = rv.fst
}
sb.m.Unlock()
rv.fstReader, err = rv.fst.Reader()
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
}
}
}
return rv, nil
}
// visitDocumentCtx holds data structures that are reusable across
// multiple VisitDocument() calls to avoid memory allocations
type visitDocumentCtx struct {
buf []byte
reader bytes.Reader
arrayPos []uint64
}
var visitDocumentCtxPool = sync.Pool{
New: func() interface{} {
reuse := &visitDocumentCtx{}
return reuse
},
}
// VisitDocument invokes the DocFieldValueVistor for each stored field
// for the specified doc number
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
defer visitDocumentCtxPool.Put(vdc)
return s.visitDocument(vdc, num, visitor)
}
func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64,
visitor segment.DocumentFieldValueVisitor) error {
// first make sure this is a valid number in this segment
if num < s.numDocs {
meta, compressed := s.getDocStoredMetaAndCompressed(num)
vdc.reader.Reset(meta)
// handle _id field special case
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
idFieldVal := compressed[:idFieldValLen]
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
if !keepGoing {
visitDocumentCtxPool.Put(vdc)
return nil
}
// handle non-"_id" fields
compressed = compressed[idFieldValLen:]
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
if err != nil {
return err
}
for keepGoing {
field, err := binary.ReadUvarint(&vdc.reader)
if err == io.EOF {
break
}
if err != nil {
return err
}
typ, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
offset, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
l, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
numap, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
var arrayPos []uint64
if numap > 0 {
if cap(vdc.arrayPos) < int(numap) {
vdc.arrayPos = make([]uint64, numap)
}
arrayPos = vdc.arrayPos[:numap]
for i := 0; i < int(numap); i++ {
ap, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
arrayPos[i] = ap
}
}
value := uncompressed[offset : offset+l]
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
}
vdc.buf = uncompressed
}
return nil
}
// DocID returns the value of the _id field for the given docNum
func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
if num >= s.numDocs {
return nil, nil
}
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
meta, compressed := s.getDocStoredMetaAndCompressed(num)
vdc.reader.Reset(meta)
// handle _id field special case
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return nil, err
}
idFieldVal := compressed[:idFieldValLen]
visitDocumentCtxPool.Put(vdc)
return idFieldVal, nil
}
// Count returns the number of documents in this segment.
func (s *SegmentBase) Count() uint64 {
return s.numDocs
}
// DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings
func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
rv := roaring.New()
if len(s.fieldsMap) > 0 {
idDict, err := s.dictionary("_id")
if err != nil {
return nil, err
}
postingsList := emptyPostingsList
sMax, err := idDict.fst.GetMaxKey()
if err != nil {
return nil, err
}
sMaxStr := string(sMax)
filteredIds := make([]string, 0, len(ids))
for _, id := range ids {
if id <= sMaxStr {
filteredIds = append(filteredIds, id)
}
}
for _, id := range filteredIds {
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
if err != nil {
return nil, err
}
postingsList.OrInto(rv)
}
}
return rv, nil
}
// Fields returns the field names used in this segment
func (s *SegmentBase) Fields() []string {
return s.fieldsInv
}
// Path returns the path of this segment on disk
func (s *Segment) Path() string {
return s.path
}
// Close releases all resources associated with this segment
func (s *Segment) Close() (err error) {
return s.DecRef()
}
func (s *Segment) closeActual() (err error) {
if s.mm != nil {
err = s.mm.Unmap()
}
// try to close file even if unmap failed
if s.f != nil {
err2 := s.f.Close()
if err == nil {
// try to return first error
err = err2
}
}
return
}
// some helpers i started adding for the command-line utility
// Data returns the underlying mmaped data slice
func (s *Segment) Data() []byte {
return s.mm
}
// CRC returns the CRC value stored in the file footer
func (s *Segment) CRC() uint32 {
return s.crc
}
// Version returns the file version in the file footer
func (s *Segment) Version() uint32 {
return s.version
}
// ChunkFactor returns the chunk factor in the file footer
func (s *Segment) ChunkMode() uint32 {
return s.chunkMode
}
// FieldsIndexOffset returns the fields index offset in the file footer
func (s *Segment) FieldsIndexOffset() uint64 {
return s.fieldsIndexOffset
}
// StoredIndexOffset returns the stored value index offset in the file footer
func (s *Segment) StoredIndexOffset() uint64 {
return s.storedIndexOffset
}
// DocValueOffset returns the docValue offset in the file footer
func (s *Segment) DocValueOffset() uint64 {
return s.docValueOffset
}
// NumDocs returns the number of documents in the file footer
func (s *Segment) NumDocs() uint64 {
return s.numDocs
}
// DictAddr is a helper function to compute the file offset where the
// dictionary is stored for the specified field.
func (s *Segment) DictAddr(field string) (uint64, error) {
fieldIDPlus1, ok := s.fieldsMap[field]
if !ok {
return 0, fmt.Errorf("no such field '%s'", field)
}
return s.dictLocs[fieldIDPlus1-1], nil
}
func (s *SegmentBase) loadDvReaders() error {
if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 {
return nil
}
var read uint64
for fieldID, field := range s.fieldsInv {
var fieldLocStart, fieldLocEnd uint64
var n int
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 {
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID)
}
read += uint64(n)
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 {
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID)
}
read += uint64(n)
fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
if err != nil {
return err
}
if fieldDvReader != nil {
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader
s.fieldDvNames = append(s.fieldDvNames, field)
}
}
return nil
}

@ -0,0 +1,145 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"encoding/binary"
"io"
"github.com/RoaringBitmap/roaring"
)
// writes out the length of the roaring bitmap in bytes as varint
// then writes out the roaring bitmap itself
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
reuseBufVarint []byte) (int, error) {
buf, err := r.ToBytes()
if err != nil {
return 0, err
}
var tw int
// write out the length
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
nw, err := w.Write(reuseBufVarint[:n])
tw += nw
if err != nil {
return tw, err
}
// write out the roaring bytes
nw, err = w.Write(buf)
tw += nw
if err != nil {
return tw, err
}
return tw, nil
}
func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
var rv uint64
var fieldsOffsets []uint64
for fieldID, fieldName := range fieldsInv {
// record start of this field
fieldsOffsets = append(fieldsOffsets, uint64(w.Count()))
// write out the dict location and field name length
_, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName)))
if err != nil {
return 0, err
}
// write out the field name
_, err = w.Write([]byte(fieldName))
if err != nil {
return 0, err
}
}
// now write out the fields index
rv = uint64(w.Count())
for fieldID := range fieldsInv {
err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID])
if err != nil {
return 0, err
}
}
return rv, nil
}
// FooterSize is the size of the footer record in bytes
// crc + ver + chunk + field offset + stored offset + num docs + docValueOffset
const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8
func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
chunkMode uint32, crcBeforeFooter uint32, writerIn io.Writer) error {
w := NewCountHashWriter(writerIn)
w.crc = crcBeforeFooter
// write out the number of docs
err := binary.Write(w, binary.BigEndian, numDocs)
if err != nil {
return err
}
// write out the stored field index location:
err = binary.Write(w, binary.BigEndian, storedIndexOffset)
if err != nil {
return err
}
// write out the field index location
err = binary.Write(w, binary.BigEndian, fieldsIndexOffset)
if err != nil {
return err
}
// write out the fieldDocValue location
err = binary.Write(w, binary.BigEndian, docValueOffset)
if err != nil {
return err
}
// write out 32-bit chunk factor
err = binary.Write(w, binary.BigEndian, chunkMode)
if err != nil {
return err
}
// write out 32-bit version
err = binary.Write(w, binary.BigEndian, Version)
if err != nil {
return err
}
// write out CRC-32 of everything upto but not including this CRC
err = binary.Write(w, binary.BigEndian, w.crc)
if err != nil {
return err
}
return nil
}
func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) {
buf := make([]byte, binary.MaxVarintLen64)
for _, val := range vals {
n := binary.PutUvarint(buf, val)
var nw int
nw, err = w.Write(buf[:n])
tw += nw
if err != nil {
return tw, err
}
}
return tw, err
}

@ -0,0 +1,177 @@
# ZAP File Format
## Legend
### Sections
|========|
| | section
|========|
### Fixed-size fields
|--------| |----| |--| |-|
| | uint64 | | uint32 | | uint16 | | uint8
|--------| |----| |--| |-|
### Varints
|~~~~~~~~|
| | varint(up to uint64)
|~~~~~~~~|
### Arbitrary-length fields
|--------...---|
| | arbitrary-length field (string, vellum, roaring bitmap)
|--------...---|
### Chunked data
[--------]
[ ]
[--------]
## Overview
Footer section describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing.
|==================================================|
| Stored Fields |
|==================================================|
|-----> | Stored Fields Index |
| |==================================================|
| | Dictionaries + Postings + DocValues |
| |==================================================|
| |---> | DocValues Index |
| | |==================================================|
| | | Fields |
| | |==================================================|
| | |-> | Fields Index |
| | | |========|========|========|========|====|====|====|
| | | | D# | SF | F | FDV | CF | V | CC | (Footer)
| | | |========|====|===|====|===|====|===|====|====|====|
| | | | | |
|-+-+-----------------| | |
| |--------------------------| |
|-------------------------------------|
D#. Number of Docs.
SF. Stored Fields Index Offset.
F. Field Index Offset.
FDV. Field DocValue Offset.
CF. Chunk Factor.
V. Version.
CC. CRC32.
## Stored Fields
Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located.
0 [SF] [SF + D# * 8]
| Stored Fields | Stored Fields Index |
|================================|==================================|
| | |
| |--------------------| ||--------|--------|. . .|--------||
| |-> | Stored Fields Data | || 0 | 1 | | D# - 1 ||
| | |--------------------| ||--------|----|---|. . .|--------||
| | | | |
|===|============================|==============|===================|
| |
|-------------------------------------------|
Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data.
Stored Fields Data
|~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~|
| MDS | CDS | MD | CD |
|~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~|
MDS. Metadata size.
CDS. Compressed data size.
MD. Metadata.
CD. Snappy-compressed data.
## Fields
Fields Index section located between addresses `F` and `len(file) - len(footer)` and consist of `uint64` values (`F1`, `F2`, ...) which are offsets to records in Fields section. We have `F# = (len(file) - len(footer) - F) / sizeof(uint64)` fields.
(...) [F] [F + F#]
| Fields | Fields Index. |
|================================|================================|
| | |
| |~~~~~~~~|~~~~~~~~|---...---|||--------|--------|...|--------||
||->| Dict | Length | Name ||| 0 | 1 | | F# - 1 ||
|| |~~~~~~~~|~~~~~~~~|---...---|||--------|----|---|...|--------||
|| | | |
||===============================|==============|=================|
| |
|----------------------------------------------|
## Dictionaries + Postings
Each of fields has its own dictionary, encoded in [Vellum](https://github.com/couchbase/vellum) format. Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term.
|================================================================|- Dictionaries +
| | Postings +
| | DocValues
| Freq/Norm (chunked) |
| [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] |
| |->[ Freq | Norm (float32 under varint) ] |
| | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] |
| | |
| |------------------------------------------------------------| |
| Location Details (chunked) | |
| [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | |
| |->[ Size | Pos | Start | End | Arr# | ArrPos | ... ] | |
| | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | |
| | | |
| |----------------------| | |
| Postings List | | |
| |~~~~~~~~|~~~~~|~~|~~~~~~~~|-----------...--| | |
| |->| F/N | LD | Length | ROARING BITMAP | | |
| | |~~~~~|~~|~~~~~~~~|~~~~~~~~|-----------...--| | |
| | |----------------------------------------------| |
| |--------------------------------------| |
| Dictionary | |
| |~~~~~~~~|--------------------------|-...-| |
| |->| Length | VELLUM DATA : (TERM -> OFFSET) | |
| | |~~~~~~~~|----------------------------...-| |
| | |
|======|=========================================================|- DocValues Index
| | |
|======|=========================================================|- Fields
| | |
| |~~~~|~~~|~~~~~~~~|---...---| |
| | Dict | Length | Name | |
| |~~~~~~~~|~~~~~~~~|---...---| |
| |
|================================================================|
## DocValues
DocValues Index is `F#` pairs of varints, one pair per field. Each pair of varints indicates start and end point of DocValues slice.
|================================================================|
| |------...--| |
| |->| DocValues |<-| |
| | |------...--| | |
|==|=================|===========================================|- DocValues Index
||~|~~~~~~~~~|~~~~~~~|~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~||
|| DV1 START | DV1 STOP | . . . . . | DV(F#) START | DV(F#) END ||
||~~~~~~~~~~~|~~~~~~~~~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~||
|================================================================|
DocValues is chunked Snappy-compressed values for each document and field.
[~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-]
[ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ]
[~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-]
Last 16 bytes are description of chunks.
|~~~~~~~~~~~~...~|----------------|----------------|
| Chunk Sizes | Chunk Size Arr | Chunk# |
|~~~~~~~~~~~~...~|----------------|----------------|

@ -0,0 +1,8 @@
language: go
go:
- "1.x"
- master
env:
- TAGS=""
- TAGS="-tags purego"
script: go test $TAGS -v ./...

@ -0,0 +1,22 @@
Copyright (c) 2016 Caleb Spare
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@ -0,0 +1,67 @@
# xxhash
[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
xxhash is a Go implementation of the 64-bit
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
This package provides a straightforward API:
```
func Sum64(b []byte) uint64
func Sum64String(s string) uint64
type Digest struct{ ... }
func New() *Digest
```
The `Digest` type implements hash.Hash64. Its key methods are:
```
func (*Digest) Write([]byte) (int, error)
func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64
```
This implementation provides a fast pure-Go implementation and an even faster
assembly implementation for amd64.
## Compatibility
This package is in a module and the latest code is in version 2 of the module.
You need a version of Go with at least "minimal module compatibility" to use
github.com/cespare/xxhash/v2:
* 1.9.7+ for Go 1.9
* 1.10.3+ for Go 1.10
* Go 1.11 or later
I recommend using the latest release of Go.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64.
| input size | purego | asm |
| --- | --- | --- |
| 5 B | 979.66 MB/s | 1291.17 MB/s |
| 100 B | 7475.26 MB/s | 7973.40 MB/s |
| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
the following commands under Go 1.11.2:
```
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)
- [FreeCache](https://github.com/coocood/freecache)

@ -0,0 +1,3 @@
module github.com/cespare/xxhash/v2
go 1.11

@ -0,0 +1,236 @@
// Package xxhash implements the 64-bit variant of xxHash (XXH64) as described
// at http://cyan4973.github.io/xxHash/.
package xxhash
import (
"encoding/binary"
"errors"
"math/bits"
)
const (
prime1 uint64 = 11400714785074694791
prime2 uint64 = 14029467366897019727
prime3 uint64 = 1609587929392839161
prime4 uint64 = 9650029242287828579
prime5 uint64 = 2870177450012600261
)
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
// possible in the Go code is worth a small (but measurable) performance boost
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
// convenience in the Go code in a few places where we need to intentionally
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
// result overflows a uint64).
var (
prime1v = prime1
prime2v = prime2
prime3v = prime3
prime4v = prime4
prime5v = prime5
)
// Digest implements hash.Hash64.
type Digest struct {
v1 uint64
v2 uint64
v3 uint64
v4 uint64
total uint64
mem [32]byte
n int // how much of mem is used
}
// New creates a new Digest that computes the 64-bit xxHash algorithm.
func New() *Digest {
var d Digest
d.Reset()
return &d
}
// Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() {
d.v1 = prime1v + prime2
d.v2 = prime2
d.v3 = 0
d.v4 = -prime1v
d.total = 0
d.n = 0
}
// Size always returns 8 bytes.
func (d *Digest) Size() int { return 8 }
// BlockSize always returns 32 bytes.
func (d *Digest) BlockSize() int { return 32 }
// Write adds more data to d. It always returns len(b), nil.
func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)
if d.n+n < 32 {
// This new data doesn't even fill the current block.
copy(d.mem[d.n:], b)
d.n += n
return
}
if d.n > 0 {
// Finish off the partial block.
copy(d.mem[d.n:], b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
b = b[32-d.n:]
d.n = 0
}
if len(b) >= 32 {
// One or more full blocks left.
nw := writeBlocks(d, b)
b = b[nw:]
}
// Store any remaining partial block.
copy(d.mem[:], b)
d.n = len(b)
return
}
// Sum appends the current hash to b and returns the resulting slice.
func (d *Digest) Sum(b []byte) []byte {
s := d.Sum64()
return append(
b,
byte(s>>56),
byte(s>>48),
byte(s>>40),
byte(s>>32),
byte(s>>24),
byte(s>>16),
byte(s>>8),
byte(s),
)
}
// Sum64 returns the current hash.
func (d *Digest) Sum64() uint64 {
var h uint64
if d.total >= 32 {
v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
h = mergeRound(h, v1)
h = mergeRound(h, v2)
h = mergeRound(h, v3)
h = mergeRound(h, v4)
} else {
h = d.v3 + prime5
}
h += d.total
i, end := 0, d.n
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(d.mem[i:i+8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(d.mem[i:i+4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
}
for i < end {
h ^= uint64(d.mem[i]) * prime5
h = rol11(h) * prime1
i++
}
h ^= h >> 33
h *= prime2
h ^= h >> 29
h *= prime3
h ^= h >> 32
return h
}
const (
magic = "xxh\x06"
marshaledSize = len(magic) + 8*5 + 32
)
// MarshalBinary implements the encoding.BinaryMarshaler interface.
func (d *Digest) MarshalBinary() ([]byte, error) {
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
b = appendUint64(b, d.v1)
b = appendUint64(b, d.v2)
b = appendUint64(b, d.v3)
b = appendUint64(b, d.v4)
b = appendUint64(b, d.total)
b = append(b, d.mem[:d.n]...)
b = b[:len(b)+len(d.mem)-d.n]
return b, nil
}
// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
func (d *Digest) UnmarshalBinary(b []byte) error {
if len(b) < len(magic) || string(b[:len(magic)]) != magic {
return errors.New("xxhash: invalid hash state identifier")
}
if len(b) != marshaledSize {
return errors.New("xxhash: invalid hash state size")
}
b = b[len(magic):]
b, d.v1 = consumeUint64(b)
b, d.v2 = consumeUint64(b)
b, d.v3 = consumeUint64(b)
b, d.v4 = consumeUint64(b)
b, d.total = consumeUint64(b)
copy(d.mem[:], b)
b = b[len(d.mem):]
d.n = int(d.total % uint64(len(d.mem)))
return nil
}
func appendUint64(b []byte, x uint64) []byte {
var a [8]byte
binary.LittleEndian.PutUint64(a[:], x)
return append(b, a[:]...)
}
func consumeUint64(b []byte) ([]byte, uint64) {
x := u64(b)
return b[8:], x
}
func u64(b []byte) uint64 { return binary.LittleEndian.Uint64(b) }
func u32(b []byte) uint32 { return binary.LittleEndian.Uint32(b) }
func round(acc, input uint64) uint64 {
acc += input * prime2
acc = rol31(acc)
acc *= prime1
return acc
}
func mergeRound(acc, val uint64) uint64 {
val = round(0, val)
acc ^= val
acc = acc*prime1 + prime4
return acc
}
func rol1(x uint64) uint64 { return bits.RotateLeft64(x, 1) }
func rol7(x uint64) uint64 { return bits.RotateLeft64(x, 7) }
func rol11(x uint64) uint64 { return bits.RotateLeft64(x, 11) }
func rol12(x uint64) uint64 { return bits.RotateLeft64(x, 12) }
func rol18(x uint64) uint64 { return bits.RotateLeft64(x, 18) }
func rol23(x uint64) uint64 { return bits.RotateLeft64(x, 23) }
func rol27(x uint64) uint64 { return bits.RotateLeft64(x, 27) }
func rol31(x uint64) uint64 { return bits.RotateLeft64(x, 31) }

@ -0,0 +1,13 @@
// +build !appengine
// +build gc
// +build !purego
package xxhash
// Sum64 computes the 64-bit xxHash digest of b.
//
//go:noescape
func Sum64(b []byte) uint64
//go:noescape
func writeBlocks(d *Digest, b []byte) int

@ -0,0 +1,215 @@
// +build !appengine
// +build gc
// +build !purego
#include "textflag.h"
// Register allocation:
// AX h
// CX pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// R15 prime4v
// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
#define round(r) \
MOVQ (CX), R12 \
ADDQ $8, CX \
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r
// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
#define mergeRound(acc, val) \
IMULQ R14, val \
ROLQ $31, val \
IMULQ R13, val \
XORQ val, acc \
IMULQ R13, acc \
ADDQ R15, acc
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
// Load fixed primes.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·prime4v(SB), R15
// Load slice.
MOVQ b_base+0(FP), CX
MOVQ b_len+8(FP), DX
LEAQ (CX)(DX*1), BX
// The first loop limit will be len(b)-32.
SUBQ $32, BX
// Check whether we have at least one block.
CMPQ DX, $32
JLT noBlocks
// Set up initial state (v1, v2, v3, v4).
MOVQ R13, R8
ADDQ R14, R8
MOVQ R14, R9
XORQ R10, R10
XORQ R11, R11
SUBQ R13, R11
// Loop until CX > BX.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ CX, BX
JLE blockLoop
MOVQ R8, AX
ROLQ $1, AX
MOVQ R9, R12
ROLQ $7, R12
ADDQ R12, AX
MOVQ R10, R12
ROLQ $12, R12
ADDQ R12, AX
MOVQ R11, R12
ROLQ $18, R12
ADDQ R12, AX
mergeRound(AX, R8)
mergeRound(AX, R9)
mergeRound(AX, R10)
mergeRound(AX, R11)
JMP afterBlocks
noBlocks:
MOVQ ·prime5v(SB), AX
afterBlocks:
ADDQ DX, AX
// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
ADDQ $24, BX
CMPQ CX, BX
JG fourByte
wordLoop:
// Calculate k1.
MOVQ (CX), R8
ADDQ $8, CX
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8
XORQ R8, AX
ROLQ $27, AX
IMULQ R13, AX
ADDQ R15, AX
CMPQ CX, BX
JLE wordLoop
fourByte:
ADDQ $4, BX
CMPQ CX, BX
JG singles
MOVL (CX), R8
ADDQ $4, CX
IMULQ R13, R8
XORQ R8, AX
ROLQ $23, AX
IMULQ R14, AX
ADDQ ·prime3v(SB), AX
singles:
ADDQ $4, BX
CMPQ CX, BX
JGE finalize
singlesLoop:
MOVBQZX (CX), R12
ADDQ $1, CX
IMULQ ·prime5v(SB), R12
XORQ R12, AX
ROLQ $11, AX
IMULQ R13, AX
CMPQ CX, BX
JL singlesLoop
finalize:
MOVQ AX, R12
SHRQ $33, R12
XORQ R12, AX
IMULQ R14, AX
MOVQ AX, R12
SHRQ $29, R12
XORQ R12, AX
IMULQ ·prime3v(SB), AX
MOVQ AX, R12
SHRQ $32, R12
XORQ R12, AX
MOVQ AX, ret+24(FP)
RET
// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.
// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
// Load fixed primes needed for round.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
// Load slice.
MOVQ b_base+8(FP), CX
MOVQ b_len+16(FP), DX
LEAQ (CX)(DX*1), BX
SUBQ $32, BX
// Load vN from d.
MOVQ d+0(FP), AX
MOVQ 0(AX), R8 // v1
MOVQ 8(AX), R9 // v2
MOVQ 16(AX), R10 // v3
MOVQ 24(AX), R11 // v4
// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ CX, BX
JLE blockLoop
// Copy vN back to d.
MOVQ R8, 0(AX)
MOVQ R9, 8(AX)
MOVQ R10, 16(AX)
MOVQ R11, 24(AX)
// The number of bytes written is CX minus the old base pointer.
SUBQ b_base+8(FP), CX
MOVQ CX, ret+32(FP)
RET

@ -0,0 +1,76 @@
// +build !amd64 appengine !gc purego
package xxhash
// Sum64 computes the 64-bit xxHash digest of b.
func Sum64(b []byte) uint64 {
// A simpler version would be
// d := New()
// d.Write(b)
// return d.Sum64()
// but this is faster, particularly for small inputs.
n := len(b)
var h uint64
if n >= 32 {
v1 := prime1v + prime2
v2 := prime2
v3 := uint64(0)
v4 := -prime1v
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
v3 = round(v3, u64(b[16:24:len(b)]))
v4 = round(v4, u64(b[24:32:len(b)]))
b = b[32:len(b):len(b)]
}
h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
h = mergeRound(h, v1)
h = mergeRound(h, v2)
h = mergeRound(h, v3)
h = mergeRound(h, v4)
} else {
h = prime5
}
h += uint64(n)
i, end := 0, len(b)
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(b[i:i+8:len(b)]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
}
for ; i < end; i++ {
h ^= uint64(b[i]) * prime5
h = rol11(h) * prime1
}
h ^= h >> 33
h *= prime2
h ^= h >> 29
h *= prime3
h ^= h >> 32
return h
}
func writeBlocks(d *Digest, b []byte) int {
v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
n := len(b)
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
v3 = round(v3, u64(b[16:24:len(b)]))
v4 = round(v4, u64(b[24:32:len(b)]))
b = b[32:len(b):len(b)]
}
d.v1, d.v2, d.v3, d.v4 = v1, v2, v3, v4
return n - len(b)
}

@ -0,0 +1,15 @@
// +build appengine
// This file contains the safe implementations of otherwise unsafe-using code.
package xxhash
// Sum64String computes the 64-bit xxHash digest of s.
func Sum64String(s string) uint64 {
return Sum64([]byte(s))
}
// WriteString adds more data to d. It always returns len(s), nil.
func (d *Digest) WriteString(s string) (n int, err error) {
return d.Write([]byte(s))
}

@ -0,0 +1,46 @@
// +build !appengine
// This file encapsulates usage of unsafe.
// xxhash_safe.go contains the safe implementations.
package xxhash
import (
"reflect"
"unsafe"
)
// Notes:
//
// See https://groups.google.com/d/msg/golang-nuts/dcjzJy-bSpw/tcZYBzQqAQAJ
// for some discussion about these unsafe conversions.
//
// In the future it's possible that compiler optimizations will make these
// unsafe operations unnecessary: https://golang.org/issue/2205.
//
// Both of these wrapper functions still incur function call overhead since they
// will not be inlined. We could write Go/asm copies of Sum64 and Digest.Write
// for strings to squeeze out a bit more speed. Mid-stack inlining should
// eventually fix this.
// Sum64String computes the 64-bit xxHash digest of s.
// It may be faster than Sum64([]byte(s)) by avoiding a copy.
func Sum64String(s string) uint64 {
var b []byte
bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
bh.Data = (*reflect.StringHeader)(unsafe.Pointer(&s)).Data
bh.Len = len(s)
bh.Cap = len(s)
return Sum64(b)
}
// WriteString adds more data to d. It always returns len(s), nil.
// It may be faster than Write([]byte(s)) by avoiding a copy.
func (d *Digest) WriteString(s string) (n int, err error) {
var b []byte
bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
bh.Data = (*reflect.StringHeader)(unsafe.Pointer(&s)).Data
bh.Len = len(s)
bh.Cap = len(s)
return d.Write(b)
}

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2014 Brian Goff
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,14 @@
package md2man
import (
"github.com/russross/blackfriday/v2"
)
// Render converts a markdown document into a roff formatted document.
func Render(doc []byte) []byte {
renderer := NewRoffRenderer()
return blackfriday.Run(doc,
[]blackfriday.Option{blackfriday.WithRenderer(renderer),
blackfriday.WithExtensions(renderer.GetExtensions())}...)
}

@ -0,0 +1,345 @@
package md2man
import (
"fmt"
"io"
"os"
"strings"
"github.com/russross/blackfriday/v2"
)
// roffRenderer implements the blackfriday.Renderer interface for creating
// roff format (manpages) from markdown text
type roffRenderer struct {
extensions blackfriday.Extensions
listCounters []int
firstHeader bool
defineTerm bool
listDepth int
}
const (
titleHeader = ".TH "
topLevelHeader = "\n\n.SH "
secondLevelHdr = "\n.SH "
otherHeader = "\n.SS "
crTag = "\n"
emphTag = "\\fI"
emphCloseTag = "\\fP"
strongTag = "\\fB"
strongCloseTag = "\\fP"
breakTag = "\n.br\n"
paraTag = "\n.PP\n"
hruleTag = "\n.ti 0\n\\l'\\n(.lu'\n"
linkTag = "\n\\[la]"
linkCloseTag = "\\[ra]"
codespanTag = "\\fB\\fC"
codespanCloseTag = "\\fR"
codeTag = "\n.PP\n.RS\n\n.nf\n"
codeCloseTag = "\n.fi\n.RE\n"
quoteTag = "\n.PP\n.RS\n"
quoteCloseTag = "\n.RE\n"
listTag = "\n.RS\n"
listCloseTag = "\n.RE\n"
arglistTag = "\n.TP\n"
tableStart = "\n.TS\nallbox;\n"
tableEnd = ".TE\n"
tableCellStart = "T{\n"
tableCellEnd = "\nT}\n"
)
// NewRoffRenderer creates a new blackfriday Renderer for generating roff documents
// from markdown
func NewRoffRenderer() *roffRenderer { // nolint: golint
var extensions blackfriday.Extensions
extensions |= blackfriday.NoIntraEmphasis
extensions |= blackfriday.Tables
extensions |= blackfriday.FencedCode
extensions |= blackfriday.SpaceHeadings
extensions |= blackfriday.Footnotes
extensions |= blackfriday.Titleblock
extensions |= blackfriday.DefinitionLists
return &roffRenderer{
extensions: extensions,
}
}
// GetExtensions returns the list of extensions used by this renderer implementation
func (r *roffRenderer) GetExtensions() blackfriday.Extensions {
return r.extensions
}
// RenderHeader handles outputting the header at document start
func (r *roffRenderer) RenderHeader(w io.Writer, ast *blackfriday.Node) {
// disable hyphenation
out(w, ".nh\n")
}
// RenderFooter handles outputting the footer at the document end; the roff
// renderer has no footer information
func (r *roffRenderer) RenderFooter(w io.Writer, ast *blackfriday.Node) {
}
// RenderNode is called for each node in a markdown document; based on the node
// type the equivalent roff output is sent to the writer
func (r *roffRenderer) RenderNode(w io.Writer, node *blackfriday.Node, entering bool) blackfriday.WalkStatus {
var walkAction = blackfriday.GoToNext
switch node.Type {
case blackfriday.Text:
r.handleText(w, node, entering)
case blackfriday.Softbreak:
out(w, crTag)
case blackfriday.Hardbreak:
out(w, breakTag)
case blackfriday.Emph:
if entering {
out(w, emphTag)
} else {
out(w, emphCloseTag)
}
case blackfriday.Strong:
if entering {
out(w, strongTag)
} else {
out(w, strongCloseTag)
}
case blackfriday.Link:
if !entering {
out(w, linkTag+string(node.LinkData.Destination)+linkCloseTag)
}
case blackfriday.Image:
// ignore images
walkAction = blackfriday.SkipChildren
case blackfriday.Code:
out(w, codespanTag)
escapeSpecialChars(w, node.Literal)
out(w, codespanCloseTag)
case blackfriday.Document:
break
case blackfriday.Paragraph:
// roff .PP markers break lists
if r.listDepth > 0 {
return blackfriday.GoToNext
}
if entering {
out(w, paraTag)
} else {
out(w, crTag)
}
case blackfriday.BlockQuote:
if entering {
out(w, quoteTag)
} else {
out(w, quoteCloseTag)
}
case blackfriday.Heading:
r.handleHeading(w, node, entering)
case blackfriday.HorizontalRule:
out(w, hruleTag)
case blackfriday.List:
r.handleList(w, node, entering)
case blackfriday.Item:
r.handleItem(w, node, entering)
case blackfriday.CodeBlock:
out(w, codeTag)
escapeSpecialChars(w, node.Literal)
out(w, codeCloseTag)
case blackfriday.Table:
r.handleTable(w, node, entering)
case blackfriday.TableCell:
r.handleTableCell(w, node, entering)
case blackfriday.TableHead:
case blackfriday.TableBody:
case blackfriday.TableRow:
// no action as cell entries do all the nroff formatting
return blackfriday.GoToNext
default:
fmt.Fprintln(os.Stderr, "WARNING: go-md2man does not handle node type "+node.Type.String())
}
return walkAction
}
func (r *roffRenderer) handleText(w io.Writer, node *blackfriday.Node, entering bool) {
var (
start, end string
)
// handle special roff table cell text encapsulation
if node.Parent.Type == blackfriday.TableCell {
if len(node.Literal) > 30 {
start = tableCellStart
end = tableCellEnd
} else {
// end rows that aren't terminated by "tableCellEnd" with a cr if end of row
if node.Parent.Next == nil && !node.Parent.IsHeader {
end = crTag
}
}
}
out(w, start)
escapeSpecialChars(w, node.Literal)
out(w, end)
}
func (r *roffRenderer) handleHeading(w io.Writer, node *blackfriday.Node, entering bool) {
if entering {
switch node.Level {
case 1:
if !r.firstHeader {
out(w, titleHeader)
r.firstHeader = true
break
}
out(w, topLevelHeader)
case 2:
out(w, secondLevelHdr)
default:
out(w, otherHeader)
}
}
}
func (r *roffRenderer) handleList(w io.Writer, node *blackfriday.Node, entering bool) {
openTag := listTag
closeTag := listCloseTag
if node.ListFlags&blackfriday.ListTypeDefinition != 0 {
// tags for definition lists handled within Item node
openTag = ""
closeTag = ""
}
if entering {
r.listDepth++
if node.ListFlags&blackfriday.ListTypeOrdered != 0 {
r.listCounters = append(r.listCounters, 1)
}
out(w, openTag)
} else {
if node.ListFlags&blackfriday.ListTypeOrdered != 0 {
r.listCounters = r.listCounters[:len(r.listCounters)-1]
}
out(w, closeTag)
r.listDepth--
}
}
func (r *roffRenderer) handleItem(w io.Writer, node *blackfriday.Node, entering bool) {
if entering {
if node.ListFlags&blackfriday.ListTypeOrdered != 0 {
out(w, fmt.Sprintf(".IP \"%3d.\" 5\n", r.listCounters[len(r.listCounters)-1]))
r.listCounters[len(r.listCounters)-1]++
} else if node.ListFlags&blackfriday.ListTypeDefinition != 0 {
// state machine for handling terms and following definitions
// since blackfriday does not distinguish them properly, nor
// does it seperate them into separate lists as it should
if !r.defineTerm {
out(w, arglistTag)
r.defineTerm = true
} else {
r.defineTerm = false
}
} else {
out(w, ".IP \\(bu 2\n")
}
} else {
out(w, "\n")
}
}
func (r *roffRenderer) handleTable(w io.Writer, node *blackfriday.Node, entering bool) {
if entering {
out(w, tableStart)
//call walker to count cells (and rows?) so format section can be produced
columns := countColumns(node)
out(w, strings.Repeat("l ", columns)+"\n")
out(w, strings.Repeat("l ", columns)+".\n")
} else {
out(w, tableEnd)
}
}
func (r *roffRenderer) handleTableCell(w io.Writer, node *blackfriday.Node, entering bool) {
var (
start, end string
)
if node.IsHeader {
start = codespanTag
end = codespanCloseTag
}
if entering {
if node.Prev != nil && node.Prev.Type == blackfriday.TableCell {
out(w, "\t"+start)
} else {
out(w, start)
}
} else {
// need to carriage return if we are at the end of the header row
if node.IsHeader && node.Next == nil {
end = end + crTag
}
out(w, end)
}
}
// because roff format requires knowing the column count before outputting any table
// data we need to walk a table tree and count the columns
func countColumns(node *blackfriday.Node) int {
var columns int
node.Walk(func(node *blackfriday.Node, entering bool) blackfriday.WalkStatus {
switch node.Type {
case blackfriday.TableRow:
if !entering {
return blackfriday.Terminate
}
case blackfriday.TableCell:
if entering {
columns++
}
default:
}
return blackfriday.GoToNext
})
return columns
}
func out(w io.Writer, output string) {
io.WriteString(w, output) // nolint: errcheck
}
func needsBackslash(c byte) bool {
for _, r := range []byte("-_&\\~") {
if c == r {
return true
}
}
return false
}
func escapeSpecialChars(w io.Writer, text []byte) {
for i := 0; i < len(text); i++ {
// escape initial apostrophe or period
if len(text) >= 1 && (text[0] == '\'' || text[0] == '.') {
out(w, "\\&")
}
// directly copy normal characters
org := i
for i < len(text) && !needsBackslash(text[i]) {
i++
}
if i > org {
w.Write(text[org:i]) // nolint: errcheck
}
// escape a character
if i >= len(text) {
break
}
w.Write([]byte{'\\', text[i]}) // nolint: errcheck
}
}

@ -43,8 +43,8 @@ The __last__ capture is embedded in each group, so `g.String()` will return the
| Category | regexp | regexp2 |
| --- | --- | --- |
| Catastrophic backtracking possible | no, constant execution time guarantees | yes, if your pattern is at risk you can use the `re.MatchTimeout` field |
| Python-style capture groups `(P<name>re)` | yes | no |
| .NET-style capture groups `(<name>re)` or `('name're)` | no | yes |
| Python-style capture groups `(?P<name>re)` | yes | no (yes in RE2 compat mode) |
| .NET-style capture groups `(?<name>re)` or `(?'name're)` | no | yes |
| comments `(?#comment)` | no | yes |
| branch numbering reset `(?\|a\|b)` | no | no |
| possessive match `(?>re)` | no | yes |
@ -54,13 +54,14 @@ The __last__ capture is embedded in each group, so `g.String()` will return the
| negative lookbehind `(?<!re)` | no | yes |
| back reference `\1` | no | yes |
| named back reference `\k'name'` | no | yes |
| named ascii character class `[[:foo:]]`| yes | no |
| conditionals `((expr)yes\|no)` | no | yes |
| named ascii character class `[[:foo:]]`| yes | no (yes in RE2 compat mode) |
| conditionals `(?(expr)yes\|no)` | no | yes |
## RE2 compatibility mode
The default behavior of `regexp2` is to match the .NET regexp engine, however the `RE2` option is provided to change the parsing to increase compatibility with RE2. Using the `RE2` option when compiling a regexp will not take away any features, but will change the following behaviors:
* add support for named ascii character classes (e.g. `[[:foo:]]`)
* add support for python-style capture groups (e.g. `(P<name>re)`)
* change singleline behavior for `$` to only match end of string (like RE2) (see [#24](https://github.com/dlclark/regexp2/issues/24))
```go
re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)

@ -566,9 +566,22 @@ func (r *runner) execute() error {
continue
case syntax.EndZ:
if r.rightchars() > 1 || r.rightchars() == 1 && r.charAt(r.textPos()) != '\n' {
rchars := r.rightchars()
if rchars > 1 {
break
}
// RE2 and EcmaScript define $ as "asserts position at the end of the string"
// PCRE/.NET adds "or before the line terminator right at the end of the string (if any)"
if (r.re.options & (RE2 | ECMAScript)) != 0 {
// RE2/Ecmascript mode
if rchars > 0 {
break
}
} else if rchars == 1 && r.charAt(r.textPos()) != '\n' {
// "regular" mode
break
}
r.advance(0)
continue
@ -938,8 +951,8 @@ func (r *runner) advance(i int) {
}
func (r *runner) goTo(newpos int) {
// when branching backward, ensure storage
if newpos < r.codepos {
// when branching backward or in place, ensure storage
if newpos <= r.codepos {
r.ensureStorage()
}

@ -1648,7 +1648,7 @@ func (p *parser) scanOptions() {
}
// Scans \ code for escape codes that map to single unicode chars.
func (p *parser) scanCharEscape() (rune, error) {
func (p *parser) scanCharEscape() (r rune, err error) {
ch := p.moveRightGetChar()
@ -1657,16 +1657,22 @@ func (p *parser) scanCharEscape() (rune, error) {
return p.scanOctal(), nil
}
pos := p.textpos()
switch ch {
case 'x':
// support for \x{HEX} syntax from Perl and PCRE
if p.charsRight() > 0 && p.rightChar(0) == '{' {
if p.useOptionE() {
return ch, nil
}
p.moveRight(1)
return p.scanHexUntilBrace()
} else {
r, err = p.scanHex(2)
}
return p.scanHex(2)
case 'u':
return p.scanHex(4)
r, err = p.scanHex(4)
case 'a':
return '\u0007', nil
case 'b':
@ -1684,13 +1690,18 @@ func (p *parser) scanCharEscape() (rune, error) {
case 'v':
return '\u000B', nil
case 'c':
return p.scanControl()
r, err = p.scanControl()
default:
if !p.useOptionE() && IsWordChar(ch) {
return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
}
return ch, nil
}
if err != nil && p.useOptionE() {
p.textto(pos)
return ch, nil
}
return
}
// Grabs and converts an ascii control character
@ -1807,7 +1818,7 @@ func (p *parser) scanOctal() rune {
//we know the first char is good because the caller had to check
i := 0
d := int(p.rightChar(0) - '0')
for c > 0 && d <= 7 {
for c > 0 && d <= 7 && d >= 0 {
if i >= 0x20 && p.useOptionE() {
break
}

@ -1,4 +1,4 @@
; http://editorconfig.org/
; https://editorconfig.org/
root = true
@ -9,6 +9,15 @@ trim_trailing_whitespace = true
indent_style = space
indent_size = 2
[{Makefile,go.mod,go.sum,*.go}]
[{Makefile,go.mod,go.sum,*.go,.gitmodules}]
indent_style = tab
indent_size = 8
[*.md]
indent_size = 4
trim_trailing_whitespace = false
eclint_indent_style = unset
[Dockerfile]
indent_size = 4

@ -0,0 +1,19 @@
linters-settings:
golint:
min-confidence: 0.3
linters:
enable:
- goconst
- gocyclo
- goerr113
- gofmt
- godox
- golint
- gocritic
- megacheck
- misspell
- prealloc
- unparam
- scopelint
- wsl

@ -0,0 +1,74 @@
project_name: editorconfig-core-go
before:
hooks:
- go mod tidy
builds:
- id: editorconfig
main: ./cmd/editorconfig/main.go
binary: editorconfig
env:
- CGO_ENABLED=0
goos:
- linux
- darwin
- windows
archives:
- id: tarball
builds:
- editorconfig
format_overrides:
- goos: windows
format: zip
files:
- none*
dockers:
- image_templates:
- docker.pkg.github.com/editorconfig/editorconfig-core-go/editorconfig:latest
- docker.pkg.github.com/editorconfig/editorconfig-core-go/editorconfig:{{ .Tag }}
- docker.pkg.github.com/editorconfig/editorconfig-core-go/editorconfig:v{{ .Major }}
- docker.pkg.github.com/editorconfig/editorconfig-core-go/editorconfig:v{{ .Major }}.{{ .Minor }}
goos: linux
goarch: amd64
binaries:
- editorconfig
build_flag_templates:
- "--pull"
- "--label=org.label-schema.schema-version=1.0"
- "--label=org.label-schema.version={{ .Version }}"
- "--label=org.label-schema.name={{ .ProjectName }}"
nfpms:
- vendor: EditorConfig
homepage: https://github.com/editorconfig/editorconfig-core-go
maintainer: Yoan Blanc <yoan@dosimple.ch>
formats:
- deb
- rpm
epoch: 1
release: 1
checksum:
name_template: 'checksums.txt'
signs:
- artifacts: checksum
snapshot:
name_template: "{{ .Tag }}-development"
changelog:
sort: asc
filters:
exclude:
- '^docs:'
- '^test:'
release:
github:
owner: editorconfig
name: editorconfig-core-go
draft: true

@ -1,30 +0,0 @@
---
language: go
dist: xenial
sudo: true
go:
- '1.11.x'
- '1.12.x'
compiler:
- gcc
install:
# first we create a directory for the CMake binaries
- DEPS_DIR="${TRAVIS_BUILD_DIR}/deps"
- mkdir ${DEPS_DIR} && cd ${DEPS_DIR}
# we use wget to fetch the cmake binaries
- travis_retry wget --no-check-certificate https://cmake.org/files/v3.14/cmake-3.14.6-Linux-x86_64.tar.gz
- echo "82e08e50ba921035efa82b859c74c5fbe27d3e49a4003020e3c77618a4e912cd cmake-3.14.6-Linux-x86_64.tar.gz" > sha256sum.txt
- sha256sum -c sha256sum.txt
- tar -xvf cmake-3.14.6-Linux-x86_64.tar.gz > /dev/null
- mv cmake-3.14.6-Linux-x86_64 cmake-install
- PATH=${DEPS_DIR}/cmake-install:${DEPS_DIR}/cmake-install/bin:$PATH
- cd ${TRAVIS_BUILD_DIR}
env:
- GO111MODULE=on
script:
- make test

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save