Move modules/gzip to gitea.com/macaron/gzip (#9058)
* Move modules/gzip to gitea.com/macaron/gzip
* Fix vendor

Branch: tokarchuk/v1.17

parent ba4e8f221b
commit 9ff6312627
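For downstream callers the move is only an import-path change: the middleware now comes from the external `gitea.com/macaron/gzip` module instead of the in-tree `modules/gzip` package. A minimal usage sketch, assuming the exported `Middleware()` API is unchanged after the move (the handler body and server setup below are illustrative, not part of this commit):

```go
package main

import (
	"gitea.com/macaron/gzip"
	"gitea.com/macaron/macaron"
)

func main() {
	m := macaron.New()
	// Previously this middleware was imported from Gitea's internal
	// modules/gzip package; after the move it comes from the new module.
	m.Use(gzip.Middleware())
	m.Get("/", func() string { return "hello" })
	m.Run()
}
```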
@@ -1,131 +0,0 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package gzip

import (
	"archive/zip"
	"bytes"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"testing"

	"gitea.com/macaron/macaron"
	gzipp "github.com/klauspost/compress/gzip"
	"github.com/stretchr/testify/assert"
)

func setup(sampleResponse []byte) (*macaron.Macaron, *[]byte) {
	m := macaron.New()
	m.Use(Middleware())
	m.Get("/", func() *[]byte { return &sampleResponse })
	return m, &sampleResponse
}

func reqNoAcceptGzip(t *testing.T, m *macaron.Macaron, sampleResponse *[]byte) {
	// Request without accept gzip: Should not gzip
	resp := httptest.NewRecorder()
	req, err := http.NewRequest("GET", "/", nil)
	assert.NoError(t, err)
	m.ServeHTTP(resp, req)

	_, ok := resp.HeaderMap[contentEncodingHeader]
	assert.False(t, ok)

	contentEncoding := resp.Header().Get(contentEncodingHeader)
	assert.NotContains(t, contentEncoding, "gzip")

	result := resp.Body.Bytes()
	assert.Equal(t, *sampleResponse, result)
}

func reqAcceptGzip(t *testing.T, m *macaron.Macaron, sampleResponse *[]byte, expectGzip bool) {
	// Request without accept gzip: Should not gzip
	resp := httptest.NewRecorder()
	req, err := http.NewRequest("GET", "/", nil)
	assert.NoError(t, err)
	req.Header.Set(acceptEncodingHeader, "gzip")
	m.ServeHTTP(resp, req)

	_, ok := resp.HeaderMap[contentEncodingHeader]
	assert.Equal(t, ok, expectGzip)

	contentEncoding := resp.Header().Get(contentEncodingHeader)
	if expectGzip {
		assert.Contains(t, contentEncoding, "gzip")
		gzippReader, err := gzipp.NewReader(resp.Body)
		assert.NoError(t, err)
		result, err := ioutil.ReadAll(gzippReader)
		assert.NoError(t, err)
		assert.Equal(t, *sampleResponse, result)
	} else {
		assert.NotContains(t, contentEncoding, "gzip")
		result := resp.Body.Bytes()
		assert.Equal(t, *sampleResponse, result)
	}
}

func TestMiddlewareSmall(t *testing.T) {
	m, sampleResponse := setup([]byte("Small response"))

	reqNoAcceptGzip(t, m, sampleResponse)

	reqAcceptGzip(t, m, sampleResponse, false)
}

func TestMiddlewareLarge(t *testing.T) {
	b := make([]byte, MinSize+1)
	for i := range b {
		b[i] = byte(i % 256)
	}
	m, sampleResponse := setup(b)

	reqNoAcceptGzip(t, m, sampleResponse)

	// This should be gzipped as we accept gzip
	reqAcceptGzip(t, m, sampleResponse, true)
}

func TestMiddlewareGzip(t *testing.T) {
	b := make([]byte, MinSize*10)
	for i := range b {
		b[i] = byte(i % 256)
	}
	outputBuffer := bytes.NewBuffer([]byte{})
	gzippWriter := gzipp.NewWriter(outputBuffer)
	gzippWriter.Write(b)
	gzippWriter.Flush()
	gzippWriter.Close()
	output := outputBuffer.Bytes()

	m, sampleResponse := setup(output)

	reqNoAcceptGzip(t, m, sampleResponse)

	// This should not be gzipped even though we accept gzip
	reqAcceptGzip(t, m, sampleResponse, false)
}

func TestMiddlewareZip(t *testing.T) {
	b := make([]byte, MinSize*10)
	for i := range b {
		b[i] = byte(i % 256)
	}
	outputBuffer := bytes.NewBuffer([]byte{})
	zipWriter := zip.NewWriter(outputBuffer)
	fileWriter, err := zipWriter.Create("default")
	assert.NoError(t, err)
	fileWriter.Write(b)
	//fileWriter.Close()
	zipWriter.Close()
	output := outputBuffer.Bytes()

	m, sampleResponse := setup(output)

	reqNoAcceptGzip(t, m, sampleResponse)

	// This should not be gzipped even though we accept gzip
	reqAcceptGzip(t, m, sampleResponse, false)
}
@@ -0,0 +1,9 @@
module gitea.com/macaron/gzip

go 1.12

require (
	gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb
	github.com/klauspost/compress v1.9.2
	github.com/stretchr/testify v1.4.0
)
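For reference, a project that wants to depend on the relocated package directly would reference it from its own go.mod; a minimal sketch with a hypothetical consumer module path and an illustrative version (in practice `go get gitea.com/macaron/gzip` resolves the real version):

```go
// go.mod of a hypothetical consumer; example.com/app and v1.0.0 are illustrative
module example.com/app

go 1.12

require gitea.com/macaron/gzip v1.0.0 // version resolved by `go get`
```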
@@ -0,0 +1,42 @@
gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591 h1:UbCTjPcLrNxR9LzKDjQBMT2zoxZuEnca1pZCpgeMuhQ=
gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gopherjs/gopherjs v0.0.0-20181103185306-d547d1d9531e h1:JKmoR8x90Iww1ks85zJ1lfDGgIiMDuIptTOhJq+zKyg=
github.com/gopherjs/gopherjs v0.0.0-20181103185306-d547d1d9531e/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/jtolds/gls v4.2.1+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/klauspost/compress v1.9.2 h1:LfVyl+ZlLlLDeQ/d2AqfGIIH4qEDu0Ed2S5GyhCWIWY=
github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304 h1:Jpy1PXuP99tXNrhbq2BaPz9B+jNAvH1JPQQpG/9GCXY=
github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v0.0.0-20181108003508-044398e4856c/go.mod h1:XDJAKZRPZ1CvBcN2aX5YOUTYGHki24fSF0Iv48Ibg0s=
github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 h1:WN9BUFbdyOsSH/XohnWpXOlq9NBD5sGAB2FciQMUEe8=
github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6xOX5DbxZEXolK+nBSvmsQwRjM=
github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/ini.v1 v1.44.0 h1:YRJzTUp0kSYWUVFF5XAbDFfyiqwsl0Vb9R8TVP5eRi0=
gopkg.in/ini.v1 v1.44.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
@@ -1,32 +0,0 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

// forwardCopy is like the built-in copy function except that it always goes
// forward from the start, even if the dst and src overlap.
// It is equivalent to:
//   for i := 0; i < n; i++ {
//     mem[dst+i] = mem[src+i]
//   }
func forwardCopy(mem []byte, dst, src, n int) {
	if dst <= src {
		copy(mem[dst:dst+n], mem[src:src+n])
		return
	}
	for {
		if dst >= src+n {
			copy(mem[dst:dst+n], mem[src:src+n])
			return
		}
		// There is some forward overlap. The destination
		// will be filled with a repeated pattern of mem[src:src+k].
		// We copy one instance of the pattern here, then repeat.
		// Each time around this loop k will double.
		k := dst - src
		copy(mem[dst:dst+k], mem[src:src+k])
		n -= k
		dst += k
	}
}
@@ -1,41 +0,0 @@
//+build !noasm
//+build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

package flate

import (
	"github.com/klauspost/cpuid"
)

// crc32sse returns a hash for the first 4 bytes of the slice
// len(a) must be >= 4.
//go:noescape
func crc32sse(a []byte) uint32

// crc32sseAll calculates hashes for each 4-byte set in a.
// dst must be east len(a) - 4 in size.
// The size is not checked by the assembly.
//go:noescape
func crc32sseAll(a []byte, dst []uint32)

// matchLenSSE4 returns the number of matching bytes in a and b
// up to length 'max'. Both slices must be at least 'max'
// bytes in size.
//
// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
//
//go:noescape
func matchLenSSE4(a, b []byte, max int) int

// histogram accumulates a histogram of b in h.
// h must be at least 256 entries in length,
// and must be cleared before calling this function.
//go:noescape
func histogram(b []byte, h []int32)

// Detect SSE 4.2 feature.
func init() {
	useSSE42 = cpuid.CPU.SSE42()
}
@@ -1,213 +0,0 @@
//+build !noasm
//+build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

// func crc32sse(a []byte) uint32
TEXT ·crc32sse(SB), 4, $0
	MOVQ a+0(FP), R10
	XORQ BX, BX

	// CRC32 dword (R10), EBX
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP)
	RET

// func crc32sseAll(a []byte, dst []uint32)
TEXT ·crc32sseAll(SB), 4, $0
	MOVQ a+0(FP), R8      // R8: src
	MOVQ a_len+8(FP), R10 // input length
	MOVQ dst+24(FP), R9   // R9: dst
	SUBQ $4, R10
	JS   end
	JZ   one_crc
	MOVQ R10, R13
	SHRQ $2, R10 // len/4
	ANDQ $3, R13 // len&3
	XORQ BX, BX
	ADDQ $1, R13
	TESTQ R10, R10
	JZ   rem_loop

crc_loop:
	MOVQ (R8), R11
	XORQ BX, BX
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11
	MOVQ R12, AX
	MOVQ R11, CX
	SHRQ $16, R12
	SHRQ $16, R11
	MOVQ R12, SI

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32 ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32 ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe
	MOVL BX, (R9)
	MOVL DX, 4(R9)
	MOVL DI, 8(R9)

	XORQ BX, BX
	MOVL R11, AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
	MOVL BX, 12(R9)

	ADDQ $16, R9
	ADDQ $4, R8
	XORQ BX, BX
	SUBQ $1, R10
	JNZ  crc_loop

rem_loop:
	MOVL (R8), AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX
	SUBQ $1, R13
	JNZ  rem_loop

end:
	RET

one_crc:
	MOVQ $1, R13
	XORQ BX, BX
	JMP  rem_loop

// func matchLenSSE4(a, b []byte, max int) int
TEXT ·matchLenSSE4(SB), 4, $0
	MOVQ a_base+0(FP), SI
	MOVQ b_base+24(FP), DI
	MOVQ DI, DX
	MOVQ max+48(FP), CX

cmp8:
	// As long as we are 8 or more bytes before the end of max, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT  cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	CMPQ CX, $0
	JEQ  matchLenEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP  cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

// func histogram(b []byte, h []int32)
TEXT ·histogram(SB), 4, $0
	MOVQ b+0(FP), SI     // SI: &b
	MOVQ b_len+8(FP), R9 // R9: len(b)
	MOVQ h+24(FP), DI    // DI: Histogram
	MOVQ R9, R8
	SHRQ $3, R8
	JZ   hist1
	XORQ R11, R11

loop_hist8:
	MOVQ (SI), R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	INCL (DI)(R10*4)

	ADDQ $8, SI
	DECQ R8
	JNZ  loop_hist8

hist1:
	ANDQ $7, R9
	JZ   end_hist
	XORQ R10, R10

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ  loop_hist1

end_hist:
	RET
@@ -1,35 +0,0 @@
//+build !amd64 noasm appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

package flate

func init() {
	useSSE42 = false
}

// crc32sse should never be called.
func crc32sse(a []byte) uint32 {
	panic("no assembler")
}

// crc32sseAll should never be called.
func crc32sseAll(a []byte, dst []uint32) {
	panic("no assembler")
}

// matchLenSSE4 should never be called.
func matchLenSSE4(a, b []byte, max int) int {
	panic("no assembler")
	return 0
}

// histogram accumulates a histogram of b in h.
//
// len(h) must be >= 256, and h's elements must be all zeroes.
func histogram(b []byte, h []int32) {
	h = h[:256]
	for _, t := range b {
		h[t]++
	}
}
File diff suppressed because it is too large
@ -0,0 +1,257 @@ |
||||
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
||||
// Modified for deflate by Klaus Post (c) 2015.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package flate |
||||
|
||||
import ( |
||||
"fmt" |
||||
"math/bits" |
||||
) |
||||
|
||||
type fastEnc interface { |
||||
Encode(dst *tokens, src []byte) |
||||
Reset() |
||||
} |
||||
|
||||
func newFastEnc(level int) fastEnc { |
||||
switch level { |
||||
case 1: |
||||
return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} |
||||
case 2: |
||||
return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} |
||||
case 3: |
||||
return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} |
||||
case 4: |
||||
return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} |
||||
case 5: |
||||
return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} |
||||
case 6: |
||||
return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} |
||||
default: |
||||
panic("invalid level specified") |
||||
} |
||||
} |
||||
|
||||
const ( |
||||
tableBits = 16 // Bits used in the table
|
||||
tableSize = 1 << tableBits // Size of the table
|
||||
tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
|
||||
baseMatchOffset = 1 // The smallest match offset
|
||||
baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5
|
||||
maxMatchOffset = 1 << 15 // The largest match offset
|
||||
|
||||
bTableBits = 18 // Bits used in the big tables
|
||||
bTableSize = 1 << bTableBits // Size of the table
|
||||
allocHistory = maxMatchOffset * 10 // Size to preallocate for history.
|
||||
bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this.
|
||||
) |
||||
|
||||
const ( |
||||
prime3bytes = 506832829 |
||||
prime4bytes = 2654435761 |
||||
prime5bytes = 889523592379 |
||||
prime6bytes = 227718039650203 |
||||
prime7bytes = 58295818150454627 |
||||
prime8bytes = 0xcf1bbcdcb7a56463 |
||||
) |
||||
|
||||
func load32(b []byte, i int) uint32 { |
||||
// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
|
||||
b = b[i:] |
||||
b = b[:4] |
||||
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
||||
} |
||||
|
||||
func load64(b []byte, i int) uint64 { |
||||
// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
|
||||
b = b[i:] |
||||
b = b[:8] |
||||
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | |
||||
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 |
||||
} |
||||
|
||||
func load3232(b []byte, i int32) uint32 { |
||||
// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
|
||||
b = b[i:] |
||||
b = b[:4] |
||||
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
||||
} |
||||
|
||||
func load6432(b []byte, i int32) uint64 { |
||||
// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
|
||||
b = b[i:] |
||||
b = b[:8] |
||||
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | |
||||
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 |
||||
} |
||||
|
||||
func hash(u uint32) uint32 { |
||||
return (u * 0x1e35a7bd) >> tableShift |
||||
} |
||||
|
||||
type tableEntry struct { |
||||
val uint32 |
||||
offset int32 |
||||
} |
||||
|
||||
// fastGen maintains the table for matches,
|
||||
// and the previous byte block for level 2.
|
||||
// This is the generic implementation.
|
||||
type fastGen struct { |
||||
hist []byte |
||||
cur int32 |
||||
} |
||||
|
||||
func (e *fastGen) addBlock(src []byte) int32 { |
||||
// check if we have space already
|
||||
if len(e.hist)+len(src) > cap(e.hist) { |
||||
if cap(e.hist) == 0 { |
||||
e.hist = make([]byte, 0, allocHistory) |
||||
} else { |
||||
if cap(e.hist) < maxMatchOffset*2 { |
||||
panic("unexpected buffer size") |
||||
} |
||||
// Move down
|
||||
offset := int32(len(e.hist)) - maxMatchOffset |
||||
copy(e.hist[0:maxMatchOffset], e.hist[offset:]) |
||||
e.cur += offset |
||||
e.hist = e.hist[:maxMatchOffset] |
||||
} |
||||
} |
||||
s := int32(len(e.hist)) |
||||
e.hist = append(e.hist, src...) |
||||
return s |
||||
} |
||||
|
||||
// hash4 returns the hash of u to fit in a hash table with h bits.
|
||||
// Preferably h should be a constant and should always be <32.
|
||||
func hash4u(u uint32, h uint8) uint32 { |
||||
return (u * prime4bytes) >> ((32 - h) & 31) |
||||
} |
||||
|
||||
type tableEntryPrev struct { |
||||
Cur tableEntry |
||||
Prev tableEntry |
||||
} |
||||
|
||||
// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
|
||||
// Preferably h should be a constant and should always be <32.
|
||||
func hash4x64(u uint64, h uint8) uint32 { |
||||
return (uint32(u) * prime4bytes) >> ((32 - h) & 31) |
||||
} |
||||
|
||||
// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
|
||||
// Preferably h should be a constant and should always be <64.
|
||||
func hash7(u uint64, h uint8) uint32 { |
||||
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) |
||||
} |
||||
|
||||
// hash8 returns the hash of u to fit in a hash table with h bits.
|
||||
// Preferably h should be a constant and should always be <64.
|
||||
func hash8(u uint64, h uint8) uint32 { |
||||
return uint32((u * prime8bytes) >> ((64 - h) & 63)) |
||||
} |
||||
|
||||
// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
|
||||
// Preferably h should be a constant and should always be <64.
|
||||
func hash6(u uint64, h uint8) uint32 { |
||||
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) |
||||
} |
||||
|
||||
// matchlen will return the match length between offsets and t in src.
|
||||
// The maximum length returned is maxMatchLength - 4.
|
||||
// It is assumed that s > t, that t >=0 and s < len(src).
|
||||
func (e *fastGen) matchlen(s, t int32, src []byte) int32 { |
||||
if debugDecode { |
||||
if t >= s { |
||||
panic(fmt.Sprint("t >=s:", t, s)) |
||||
} |
||||
if int(s) >= len(src) { |
||||
panic(fmt.Sprint("s >= len(src):", s, len(src))) |
||||
} |
||||
if t < 0 { |
||||
panic(fmt.Sprint("t < 0:", t)) |
||||
} |
||||
if s-t > maxMatchOffset { |
||||
panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) |
||||
} |
||||
} |
||||
s1 := int(s) + maxMatchLength - 4 |
||||
if s1 > len(src) { |
||||
s1 = len(src) |
||||
} |
||||
|
||||
// Extend the match to be as long as possible.
|
||||
return int32(matchLen(src[s:s1], src[t:])) |
||||
} |
||||
|
||||
// matchlenLong will return the match length between offsets and t in src.
|
||||
// It is assumed that s > t, that t >=0 and s < len(src).
|
||||
func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { |
||||
if debugDecode { |
||||
if t >= s { |
||||
panic(fmt.Sprint("t >=s:", t, s)) |
||||
} |
||||
if int(s) >= len(src) { |
||||
panic(fmt.Sprint("s >= len(src):", s, len(src))) |
||||
} |
||||
if t < 0 { |
||||
panic(fmt.Sprint("t < 0:", t)) |
||||
} |
||||
if s-t > maxMatchOffset { |
||||
panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) |
||||
} |
||||
} |
||||
// Extend the match to be as long as possible.
|
||||
return int32(matchLen(src[s:], src[t:])) |
||||
} |
||||
|
||||
// Reset the encoding table.
|
||||
func (e *fastGen) Reset() { |
||||
if cap(e.hist) < int(maxMatchOffset*8) { |
||||
l := maxMatchOffset * 8 |
||||
// Make it at least 1MB.
|
||||
if l < 1<<20 { |
||||
l = 1 << 20 |
||||
} |
||||
e.hist = make([]byte, 0, l) |
||||
} |
||||
// We offset current position so everything will be out of reach
|
||||
e.cur += maxMatchOffset + int32(len(e.hist)) |
||||
e.hist = e.hist[:0] |
||||
} |
||||
|
||||
// matchLen returns the maximum length.
|
||||
// 'a' must be the shortest of the two.
|
||||
func matchLen(a, b []byte) int { |
||||
b = b[:len(a)] |
||||
var checked int |
||||
if len(a) > 4 { |
||||
// Try 4 bytes first
|
||||
if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { |
||||
return bits.TrailingZeros32(diff) >> 3 |
||||
} |
||||
// Switch to 8 byte matching.
|
||||
checked = 4 |
||||
a = a[4:] |
||||
b = b[4:] |
||||
for len(a) >= 8 { |
||||
b = b[:len(a)] |
||||
if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { |
||||
return checked + (bits.TrailingZeros64(diff) >> 3) |
||||
} |
||||
checked += 8 |
||||
a = a[8:] |
||||
b = b[8:] |
||||
} |
||||
} |
||||
b = b[:len(a)] |
||||
for i := range a { |
||||
if a[i] != b[i] { |
||||
return int(i) + checked |
||||
} |
||||
} |
||||
return len(a) + checked |
||||
} |
@ -0,0 +1,174 @@ |
||||
package flate |
||||
|
||||
// fastGen maintains the table for matches,
|
||||
// and the previous byte block for level 2.
|
||||
// This is the generic implementation.
|
||||
type fastEncL1 struct { |
||||
fastGen |
||||
table [tableSize]tableEntry |
||||
} |
||||
|
||||
// EncodeL1 uses a similar algorithm to level 1
|
||||
func (e *fastEncL1) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 12 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Protect against e.cur wraparound.
|
||||
for e.cur >= bufferReset { |
||||
if len(e.hist) == 0 { |
||||
for i := range e.table[:] { |
||||
e.table[i] = tableEntry{} |
||||
} |
||||
e.cur = maxMatchOffset |
||||
break |
||||
} |
||||
// Shift down everything in the table that isn't already too far away.
|
||||
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||
for i := range e.table[:] { |
||||
v := e.table[i].offset |
||||
if v <= minOff { |
||||
v = 0 |
||||
} else { |
||||
v = v - e.cur + maxMatchOffset |
||||
} |
||||
e.table[i].offset = v |
||||
} |
||||
e.cur = maxMatchOffset |
||||
} |
||||
|
||||
s := e.addBlock(src) |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Override src
|
||||
src = e.hist |
||||
nextEmit := s |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load3232(src, s) |
||||
|
||||
for { |
||||
const skipLog = 5 |
||||
const doEvery = 2 |
||||
|
||||
nextS := s |
||||
var candidate tableEntry |
||||
for { |
||||
nextHash := hash(cv) |
||||
candidate = e.table[nextHash] |
||||
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
now := load6432(src, nextS) |
||||
e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} |
||||
nextHash = hash(uint32(now)) |
||||
|
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset && cv == candidate.val { |
||||
e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} |
||||
break |
||||
} |
||||
|
||||
// Do one right away...
|
||||
cv = uint32(now) |
||||
s = nextS |
||||
nextS++ |
||||
candidate = e.table[nextHash] |
||||
now >>= 8 |
||||
e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} |
||||
|
||||
offset = s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset && cv == candidate.val { |
||||
e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} |
||||
break |
||||
} |
||||
cv = uint32(now) |
||||
s = nextS |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
t := candidate.offset - e.cur |
||||
l := e.matchlenLong(s+4, t+4, src) + 4 |
||||
|
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
|
||||
// Save the match found
|
||||
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
if s >= sLimit { |
||||
// Index first pair after match end.
|
||||
if int(s+l+4) < len(src) { |
||||
cv := load3232(src, s) |
||||
e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv} |
||||
} |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-2 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load6432(src, s-2) |
||||
o := e.cur + s - 2 |
||||
prevHash := hash(uint32(x)) |
||||
e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} |
||||
x >>= 16 |
||||
currHash := hash(uint32(x)) |
||||
candidate = e.table[currHash] |
||||
e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)} |
||||
|
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset > maxMatchOffset || uint32(x) != candidate.val { |
||||
cv = uint32(x >> 8) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
@ -0,0 +1,199 @@ |
||||
package flate |
||||
|
||||
// fastGen maintains the table for matches,
|
||||
// and the previous byte block for level 2.
|
||||
// This is the generic implementation.
|
||||
type fastEncL2 struct { |
||||
fastGen |
||||
table [bTableSize]tableEntry |
||||
} |
||||
|
||||
// EncodeL2 uses a similar algorithm to level 1, but is capable
|
||||
// of matching across blocks giving better compression at a small slowdown.
|
||||
func (e *fastEncL2) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 12 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Protect against e.cur wraparound.
|
||||
for e.cur >= bufferReset { |
||||
if len(e.hist) == 0 { |
||||
for i := range e.table[:] { |
||||
e.table[i] = tableEntry{} |
||||
} |
||||
e.cur = maxMatchOffset |
||||
break |
||||
} |
||||
// Shift down everything in the table that isn't already too far away.
|
||||
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||
for i := range e.table[:] { |
||||
v := e.table[i].offset |
||||
if v <= minOff { |
||||
v = 0 |
||||
} else { |
||||
v = v - e.cur + maxMatchOffset |
||||
} |
||||
e.table[i].offset = v |
||||
} |
||||
e.cur = maxMatchOffset |
||||
} |
||||
|
||||
s := e.addBlock(src) |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Override src
|
||||
src = e.hist |
||||
nextEmit := s |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load3232(src, s) |
||||
for { |
||||
// When should we start skipping if we haven't found matches in a long while.
|
||||
const skipLog = 5 |
||||
const doEvery = 2 |
||||
|
||||
nextS := s |
||||
var candidate tableEntry |
||||
for { |
||||
nextHash := hash4u(cv, bTableBits) |
||||
s = nextS |
||||
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidate = e.table[nextHash] |
||||
now := load6432(src, nextS) |
||||
e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} |
||||
nextHash = hash4u(uint32(now), bTableBits) |
||||
|
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset && cv == candidate.val { |
||||
e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} |
||||
break |
||||
} |
||||
|
||||
// Do one right away...
|
||||
cv = uint32(now) |
||||
s = nextS |
||||
nextS++ |
||||
candidate = e.table[nextHash] |
||||
now >>= 8 |
||||
e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} |
||||
|
||||
offset = s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset && cv == candidate.val { |
||||
break |
||||
} |
||||
cv = uint32(now) |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
t := candidate.offset - e.cur |
||||
l := e.matchlenLong(s+4, t+4, src) + 4 |
||||
|
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
|
||||
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
|
||||
if s >= sLimit { |
||||
// Index first pair after match end.
|
||||
if int(s+l+4) < len(src) { |
||||
cv := load3232(src, s) |
||||
e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv} |
||||
} |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// Store every second hash in-between, but offset by 1.
|
||||
for i := s - l + 2; i < s-5; i += 7 { |
||||
x := load6432(src, int32(i)) |
||||
nextHash := hash4u(uint32(x), bTableBits) |
||||
e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)} |
||||
// Skip one
|
||||
x >>= 16 |
||||
nextHash = hash4u(uint32(x), bTableBits) |
||||
e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)} |
||||
// Skip one
|
||||
x >>= 16 |
||||
nextHash = hash4u(uint32(x), bTableBits) |
||||
e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)} |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-2 to s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load6432(src, s-2) |
||||
o := e.cur + s - 2 |
||||
prevHash := hash4u(uint32(x), bTableBits) |
||||
prevHash2 := hash4u(uint32(x>>8), bTableBits) |
||||
e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} |
||||
e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)} |
||||
currHash := hash4u(uint32(x>>16), bTableBits) |
||||
candidate = e.table[currHash] |
||||
e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)} |
||||
|
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset > maxMatchOffset || uint32(x>>16) != candidate.val { |
||||
cv = uint32(x >> 24) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
|
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
@ -0,0 +1,225 @@ |
||||
package flate |
||||
|
||||
// fastEncL3
|
||||
type fastEncL3 struct { |
||||
fastGen |
||||
table [tableSize]tableEntryPrev |
||||
} |
||||
|
||||
// Encode uses a similar algorithm to level 2, will check up to two candidates.
|
||||
func (e *fastEncL3) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 8 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Protect against e.cur wraparound.
|
||||
for e.cur >= bufferReset { |
||||
if len(e.hist) == 0 { |
||||
for i := range e.table[:] { |
||||
e.table[i] = tableEntryPrev{} |
||||
} |
||||
e.cur = maxMatchOffset |
||||
break |
||||
} |
||||
// Shift down everything in the table that isn't already too far away.
|
||||
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||
for i := range e.table[:] { |
||||
v := e.table[i] |
||||
if v.Cur.offset <= minOff { |
||||
v.Cur.offset = 0 |
||||
} else { |
||||
v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset |
||||
} |
||||
if v.Prev.offset <= minOff { |
||||
v.Prev.offset = 0 |
||||
} else { |
||||
v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset |
||||
} |
||||
e.table[i] = v |
||||
} |
||||
e.cur = maxMatchOffset |
||||
} |
||||
|
||||
s := e.addBlock(src) |
||||
|
||||
// Skip if too small.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Override src
|
||||
src = e.hist |
||||
nextEmit := s |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load3232(src, s) |
||||
for { |
||||
const skipLog = 6 |
||||
nextS := s |
||||
var candidate tableEntry |
||||
for { |
||||
nextHash := hash(cv) |
||||
s = nextS |
||||
nextS = s + 1 + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidates := e.table[nextHash] |
||||
now := load3232(src, nextS) |
||||
e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} |
||||
|
||||
// Check both candidates
|
||||
candidate = candidates.Cur |
||||
offset := s - (candidate.offset - e.cur) |
||||
if cv == candidate.val { |
||||
if offset > maxMatchOffset { |
||||
cv = now |
||||
// Previous will also be invalid, we have nothing.
|
||||
continue |
||||
} |
||||
o2 := s - (candidates.Prev.offset - e.cur) |
||||
if cv != candidates.Prev.val || o2 > maxMatchOffset { |
||||
break |
||||
} |
||||
// Both match and are valid, pick longest.
|
||||
l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) |
||||
if l2 > l1 { |
||||
candidate = candidates.Prev |
||||
} |
||||
break |
||||
} else { |
||||
// We only check if value mismatches.
|
||||
// Offset will always be invalid in other cases.
|
||||
candidate = candidates.Prev |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset <= maxMatchOffset { |
||||
break |
||||
} |
||||
} |
||||
} |
||||
cv = now |
||||
} |
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
t := candidate.offset - e.cur |
||||
l := e.matchlenLong(s+4, t+4, src) + 4 |
||||
|
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
|
||||
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
|
||||
if s >= sLimit { |
||||
t += l |
||||
// Index first pair after match end.
|
||||
if int(t+4) < len(src) && t > 0 { |
||||
cv := load3232(src, t) |
||||
nextHash := hash(cv) |
||||
e.table[nextHash] = tableEntryPrev{ |
||||
Prev: e.table[nextHash].Cur, |
||||
Cur: tableEntry{offset: e.cur + t, val: cv}, |
||||
} |
||||
} |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-3 to s.
|
||||
x := load6432(src, s-3) |
||||
prevHash := hash(uint32(x)) |
||||
e.table[prevHash] = tableEntryPrev{ |
||||
Prev: e.table[prevHash].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
prevHash = hash(uint32(x)) |
||||
|
||||
e.table[prevHash] = tableEntryPrev{ |
||||
Prev: e.table[prevHash].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
prevHash = hash(uint32(x)) |
||||
|
||||
e.table[prevHash] = tableEntryPrev{ |
||||
Prev: e.table[prevHash].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
currHash := hash(uint32(x)) |
||||
candidates := e.table[currHash] |
||||
cv = uint32(x) |
||||
e.table[currHash] = tableEntryPrev{ |
||||
Prev: candidates.Cur, |
||||
Cur: tableEntry{offset: s + e.cur, val: cv}, |
||||
} |
||||
|
||||
// Check both candidates
|
||||
candidate = candidates.Cur |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset <= maxMatchOffset { |
||||
continue |
||||
} |
||||
} else { |
||||
// We only check if value mismatches.
|
||||
// Offset will always be invalid in other cases.
|
||||
candidate = candidates.Prev |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset <= maxMatchOffset { |
||||
continue |
||||
} |
||||
} |
||||
} |
||||
cv = uint32(x >> 8) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
|
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
@ -0,0 +1,210 @@ |
||||
package flate |
||||
|
||||
import "fmt" |
||||
|
||||
type fastEncL4 struct { |
||||
fastGen |
||||
table [tableSize]tableEntry |
||||
bTable [tableSize]tableEntry |
||||
} |
||||
|
||||
func (e *fastEncL4) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 12 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Protect against e.cur wraparound.
|
||||
for e.cur >= bufferReset { |
||||
if len(e.hist) == 0 { |
||||
for i := range e.table[:] { |
||||
e.table[i] = tableEntry{} |
||||
} |
||||
for i := range e.bTable[:] { |
||||
e.bTable[i] = tableEntry{} |
||||
} |
||||
e.cur = maxMatchOffset |
||||
break |
||||
} |
||||
// Shift down everything in the table that isn't already too far away.
|
||||
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||
for i := range e.table[:] { |
||||
v := e.table[i].offset |
||||
if v <= minOff { |
||||
v = 0 |
||||
} else { |
||||
v = v - e.cur + maxMatchOffset |
||||
} |
||||
e.table[i].offset = v |
||||
} |
||||
for i := range e.bTable[:] { |
||||
v := e.bTable[i].offset |
||||
if v <= minOff { |
||||
v = 0 |
||||
} else { |
||||
v = v - e.cur + maxMatchOffset |
||||
} |
||||
e.bTable[i].offset = v |
||||
} |
||||
e.cur = maxMatchOffset |
||||
} |
||||
|
||||
s := e.addBlock(src) |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Override src
|
||||
src = e.hist |
||||
nextEmit := s |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load6432(src, s) |
||||
for { |
||||
const skipLog = 6 |
||||
const doEvery = 1 |
||||
|
||||
nextS := s |
||||
var t int32 |
||||
for { |
||||
nextHashS := hash4x64(cv, tableBits) |
||||
nextHashL := hash7(cv, tableBits) |
||||
|
||||
s = nextS |
||||
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
// Fetch a short+long candidate
|
||||
sCandidate := e.table[nextHashS] |
||||
lCandidate := e.bTable[nextHashL] |
||||
next := load6432(src, nextS) |
||||
entry := tableEntry{offset: s + e.cur, val: uint32(cv)} |
||||
e.table[nextHashS] = entry |
||||
e.bTable[nextHashL] = entry |
||||
|
||||
t = lCandidate.offset - e.cur |
||||
if s-t < maxMatchOffset && uint32(cv) == lCandidate.val { |
||||
// We got a long match. Use that.
|
||||
break |
||||
} |
||||
|
||||
t = sCandidate.offset - e.cur |
||||
if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { |
||||
// Found a 4 match...
|
||||
lCandidate = e.bTable[hash7(next, tableBits)] |
||||
|
||||
// If the next long is a candidate, check if we should use that instead...
|
||||
lOff := nextS - (lCandidate.offset - e.cur) |
||||
if lOff < maxMatchOffset && lCandidate.val == uint32(next) { |
||||
l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) |
||||
if l2 > l1 { |
||||
s = nextS |
||||
t = lCandidate.offset - e.cur |
||||
} |
||||
} |
||||
break |
||||
} |
||||
cv = next |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
l := e.matchlenLong(s+4, t+4, src) + 4 |
||||
|
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
if false { |
||||
if t >= s { |
||||
panic("s-t") |
||||
} |
||||
if (s - t) > maxMatchOffset { |
||||
panic(fmt.Sprintln("mmo", t)) |
||||
} |
||||
if l < baseMatchLength { |
||||
panic("bml") |
||||
} |
||||
} |
||||
|
||||
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
|
||||
if s >= sLimit { |
||||
// Index first pair after match end.
|
||||
if int(s+8) < len(src) { |
||||
cv := load6432(src, s) |
||||
e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} |
||||
e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} |
||||
} |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// Store every 3rd hash in-between
|
||||
if true { |
||||
i := nextS |
||||
if i < s-1 { |
||||
cv := load6432(src, i) |
||||
t := tableEntry{offset: i + e.cur, val: uint32(cv)} |
||||
t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} |
||||
e.bTable[hash7(cv, tableBits)] = t |
||||
e.bTable[hash7(cv>>8, tableBits)] = t2 |
||||
e.table[hash4u(t2.val, tableBits)] = t2 |
||||
|
||||
i += 3 |
||||
for ; i < s-1; i += 3 { |
||||
cv := load6432(src, i) |
||||
t := tableEntry{offset: i + e.cur, val: uint32(cv)} |
||||
t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} |
||||
e.bTable[hash7(cv, tableBits)] = t |
||||
e.bTable[hash7(cv>>8, tableBits)] = t2 |
||||
e.table[hash4u(t2.val, tableBits)] = t2 |
||||
} |
||||
} |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s.
|
||||
x := load6432(src, s-1) |
||||
o := e.cur + s - 1 |
||||
prevHashS := hash4x64(x, tableBits) |
||||
prevHashL := hash7(x, tableBits) |
||||
e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} |
||||
e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)} |
||||
cv = x >> 8 |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
|
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
@ -0,0 +1,276 @@ |
||||
package flate |
||||
|
||||
import "fmt" |
||||
|
||||
type fastEncL5 struct { |
||||
fastGen |
||||
table [tableSize]tableEntry |
||||
bTable [tableSize]tableEntryPrev |
||||
} |
||||
|
||||
func (e *fastEncL5) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 12 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Protect against e.cur wraparound.
|
||||
for e.cur >= bufferReset { |
||||
if len(e.hist) == 0 { |
||||
for i := range e.table[:] { |
||||
e.table[i] = tableEntry{} |
||||
} |
||||
for i := range e.bTable[:] { |
||||
e.bTable[i] = tableEntryPrev{} |
||||
} |
||||
e.cur = maxMatchOffset |
||||
break |
||||
} |
||||
// Shift down everything in the table that isn't already too far away.
|
||||
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||
for i := range e.table[:] { |
||||
v := e.table[i].offset |
||||
if v <= minOff { |
||||
v = 0 |
||||
} else { |
||||
v = v - e.cur + maxMatchOffset |
||||
} |
||||
e.table[i].offset = v |
||||
} |
||||
for i := range e.bTable[:] { |
||||
v := e.bTable[i] |
||||
if v.Cur.offset <= minOff { |
||||
v.Cur.offset = 0 |
||||
v.Prev.offset = 0 |
||||
} else { |
||||
v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset |
||||
if v.Prev.offset <= minOff { |
||||
v.Prev.offset = 0 |
||||
} else { |
||||
v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset |
||||
} |
||||
} |
||||
e.bTable[i] = v |
||||
} |
||||
e.cur = maxMatchOffset |
||||
} |
||||
|
||||
s := e.addBlock(src) |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Override src
|
||||
src = e.hist |
||||
nextEmit := s |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load6432(src, s) |
||||
for { |
||||
const skipLog = 6 |
||||
const doEvery = 1 |
||||
|
||||
nextS := s |
||||
var l int32 |
||||
var t int32 |
||||
for { |
||||
nextHashS := hash4x64(cv, tableBits) |
||||
nextHashL := hash7(cv, tableBits) |
||||
|
||||
s = nextS |
||||
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
// Fetch a short+long candidate
|
||||
sCandidate := e.table[nextHashS] |
||||
lCandidate := e.bTable[nextHashL] |
||||
next := load6432(src, nextS) |
||||
entry := tableEntry{offset: s + e.cur, val: uint32(cv)} |
||||
e.table[nextHashS] = entry |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = entry, eLong.Cur |
||||
|
||||
nextHashS = hash4x64(next, tableBits) |
||||
nextHashL = hash7(next, tableBits) |
||||
|
||||
t = lCandidate.Cur.offset - e.cur |
||||
if s-t < maxMatchOffset { |
||||
if uint32(cv) == lCandidate.Cur.val { |
||||
// Store the next match
|
||||
e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur |
||||
|
||||
t2 := lCandidate.Prev.offset - e.cur |
||||
if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { |
||||
l = e.matchlen(s+4, t+4, src) + 4 |
||||
ml1 := e.matchlen(s+4, t2+4, src) + 4 |
||||
if ml1 > l { |
||||
t = t2 |
||||
l = ml1 |
||||
break |
||||
} |
||||
} |
||||
break |
||||
} |
||||
t = lCandidate.Prev.offset - e.cur |
||||
if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { |
||||
// Store the next match
|
||||
e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur |
||||
break |
||||
} |
||||
} |
||||
|
||||
t = sCandidate.offset - e.cur |
||||
if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { |
||||
// Found a 4 match...
|
||||
l = e.matchlen(s+4, t+4, src) + 4 |
||||
lCandidate = e.bTable[nextHashL] |
||||
// Store the next match
|
||||
|
||||
e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur |
||||
|
||||
// If the next long is a candidate, use that...
|
||||
t2 := lCandidate.Cur.offset - e.cur |
||||
if nextS-t2 < maxMatchOffset { |
||||
if lCandidate.Cur.val == uint32(next) { |
||||
ml := e.matchlen(nextS+4, t2+4, src) + 4 |
||||
if ml > l { |
||||
t = t2 |
||||
s = nextS |
||||
l = ml |
||||
break |
||||
} |
||||
} |
||||
// If the previous long is a candidate, use that...
|
||||
t2 = lCandidate.Prev.offset - e.cur |
||||
if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { |
||||
ml := e.matchlen(nextS+4, t2+4, src) + 4 |
||||
if ml > l { |
||||
t = t2 |
||||
s = nextS |
||||
l = ml |
||||
break |
||||
} |
||||
} |
||||
} |
||||
break |
||||
} |
||||
cv = next |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
if l == 0 { |
||||
l = e.matchlenLong(s+4, t+4, src) + 4 |
||||
} else if l == maxMatchLength { |
||||
l += e.matchlenLong(s+l, t+l, src) |
||||
} |
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
if false { |
||||
if t >= s { |
||||
panic(fmt.Sprintln("s-t", s, t)) |
||||
} |
||||
if (s - t) > maxMatchOffset { |
||||
panic(fmt.Sprintln("mmo", s-t)) |
||||
} |
||||
if l < baseMatchLength { |
||||
panic("bml") |
||||
} |
||||
} |
||||
|
||||
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
|
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// Store every 3rd hash in-between.
|
||||
if true { |
||||
const hashEvery = 3 |
||||
i := s - l + 1 |
||||
if i < s-1 { |
||||
cv := load6432(src, i) |
||||
t := tableEntry{offset: i + e.cur, val: uint32(cv)} |
||||
e.table[hash4x64(cv, tableBits)] = t |
||||
eLong := &e.bTable[hash7(cv, tableBits)] |
||||
eLong.Cur, eLong.Prev = t, eLong.Cur |
||||
|
||||
// Add a long-table entry at i+1
|
||||
cv >>= 8 |
||||
t = tableEntry{offset: t.offset + 1, val: uint32(cv)} |
||||
eLong = &e.bTable[hash7(cv, tableBits)] |
||||
eLong.Cur, eLong.Prev = t, eLong.Cur |
||||
|
||||
// We only have enough bits for a short entry at i+2
|
||||
cv >>= 8 |
||||
t = tableEntry{offset: t.offset + 1, val: uint32(cv)} |
||||
e.table[hash4x64(cv, tableBits)] = t |
||||
|
||||
// Skip one - otherwise we risk hitting 's'
|
||||
i += 4 |
||||
for ; i < s-1; i += hashEvery { |
||||
cv := load6432(src, i) |
||||
t := tableEntry{offset: i + e.cur, val: uint32(cv)} |
||||
t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} |
||||
eLong := &e.bTable[hash7(cv, tableBits)] |
||||
eLong.Cur, eLong.Prev = t, eLong.Cur |
||||
e.table[hash4u(t2.val, tableBits)] = t2 |
||||
} |
||||
} |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s.
|
||||
x := load6432(src, s-1) |
||||
o := e.cur + s - 1 |
||||
prevHashS := hash4x64(x, tableBits) |
||||
prevHashL := hash7(x, tableBits) |
||||
e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} |
||||
eLong := &e.bTable[prevHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur |
||||
cv = x >> 8 |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
|
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
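The level 5 and 6 encoders above keep a second, long-hash table whose buckets hold the two most recent entries, refreshed with the `eLong.Cur, eLong.Prev = entry, eLong.Cur` idiom. A minimal standalone sketch of that two-slot history (type and variable names here are mine, not from the diff):

```go
package main

import "fmt"

type entry struct{ offset int32 }

// bucket plays the role of tableEntryPrev: it remembers the last two
// positions that hashed to the same slot.
type bucket struct{ Cur, Prev entry }

func main() {
	var b bucket
	for _, off := range []int32{10, 42, 99} {
		// Same idiom as the encoder: the old Cur slides into Prev.
		b.Cur, b.Prev = entry{offset: off}, b.Cur
	}
	fmt.Println(b.Cur.offset, b.Prev.offset) // prints: 99 42
}
```

This is why the match search above can probe both lCandidate.Cur and lCandidate.Prev: each long-hash bucket offers two earlier positions for the price of one lookup.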
@ -0,0 +1,279 @@ |
||||
package flate |
||||
|
||||
import "fmt" |
||||
|
||||
type fastEncL6 struct { |
||||
fastGen |
||||
table [tableSize]tableEntry |
||||
bTable [tableSize]tableEntryPrev |
||||
} |
||||
|
||||
func (e *fastEncL6) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 12 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Protect against e.cur wraparound.
|
||||
for e.cur >= bufferReset { |
||||
if len(e.hist) == 0 { |
||||
for i := range e.table[:] { |
||||
e.table[i] = tableEntry{} |
||||
} |
||||
for i := range e.bTable[:] { |
||||
e.bTable[i] = tableEntryPrev{} |
||||
} |
||||
e.cur = maxMatchOffset |
||||
break |
||||
} |
||||
// Shift down everything in the table that isn't already too far away.
|
||||
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||
for i := range e.table[:] { |
||||
v := e.table[i].offset |
||||
if v <= minOff { |
||||
v = 0 |
||||
} else { |
||||
v = v - e.cur + maxMatchOffset |
||||
} |
||||
e.table[i].offset = v |
||||
} |
||||
for i := range e.bTable[:] { |
||||
v := e.bTable[i] |
||||
if v.Cur.offset <= minOff { |
||||
v.Cur.offset = 0 |
||||
v.Prev.offset = 0 |
||||
} else { |
||||
v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset |
||||
if v.Prev.offset <= minOff { |
||||
v.Prev.offset = 0 |
||||
} else { |
||||
v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset |
||||
} |
||||
} |
||||
e.bTable[i] = v |
||||
} |
||||
e.cur = maxMatchOffset |
||||
} |
||||
|
||||
s := e.addBlock(src) |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Override src
|
||||
src = e.hist |
||||
nextEmit := s |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load6432(src, s) |
||||
// Repeat MUST be > 1 and within range
|
||||
repeat := int32(1) |
||||
for { |
||||
const skipLog = 7 |
||||
const doEvery = 1 |
||||
|
||||
nextS := s |
||||
var l int32 |
||||
var t int32 |
||||
for { |
||||
nextHashS := hash4x64(cv, tableBits) |
||||
nextHashL := hash7(cv, tableBits) |
||||
s = nextS |
||||
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
// Fetch a short+long candidate
|
||||
sCandidate := e.table[nextHashS] |
||||
lCandidate := e.bTable[nextHashL] |
||||
next := load6432(src, nextS) |
||||
entry := tableEntry{offset: s + e.cur, val: uint32(cv)} |
||||
e.table[nextHashS] = entry |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = entry, eLong.Cur |
||||
|
||||
// Calculate hashes of 'next'
|
||||
nextHashS = hash4x64(next, tableBits) |
||||
nextHashL = hash7(next, tableBits) |
||||
|
||||
t = lCandidate.Cur.offset - e.cur |
||||
if s-t < maxMatchOffset { |
||||
if uint32(cv) == lCandidate.Cur.val { |
||||
// Long candidate matches at least 4 bytes.
|
||||
|
||||
// Store the next match
|
||||
e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur |
||||
|
||||
// Check the previous long candidate as well.
|
||||
t2 := lCandidate.Prev.offset - e.cur |
||||
if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { |
||||
l = e.matchlen(s+4, t+4, src) + 4 |
||||
ml1 := e.matchlen(s+4, t2+4, src) + 4 |
||||
if ml1 > l { |
||||
t = t2 |
||||
l = ml1 |
||||
break |
||||
} |
||||
} |
||||
break |
||||
} |
||||
// Current value did not match, but check if previous long value does.
|
||||
t = lCandidate.Prev.offset - e.cur |
||||
if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { |
||||
// Store the next match
|
||||
e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur |
||||
break |
||||
} |
||||
} |
||||
|
||||
t = sCandidate.offset - e.cur |
||||
if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { |
||||
// Found a 4-byte match...
|
||||
l = e.matchlen(s+4, t+4, src) + 4 |
||||
|
||||
// Look up next long candidate (at nextS)
|
||||
lCandidate = e.bTable[nextHashL] |
||||
|
||||
// Store the next match
|
||||
e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} |
||||
eLong := &e.bTable[nextHashL] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur |
||||
|
||||
// Check repeat at s + repOff
|
||||
const repOff = 1 |
||||
t2 := s - repeat + repOff |
||||
if load3232(src, t2) == uint32(cv>>(8*repOff)) { |
||||
ml := e.matchlen(s+4+repOff, t2+4, src) + 4 |
||||
if ml > l { |
||||
t = t2 |
||||
l = ml |
||||
s += repOff |
||||
// Not worth checking more.
|
||||
break |
||||
} |
||||
} |
||||
|
||||
// If the next long is a candidate, use that...
|
||||
t2 = lCandidate.Cur.offset - e.cur |
||||
if nextS-t2 < maxMatchOffset { |
||||
if lCandidate.Cur.val == uint32(next) { |
||||
ml := e.matchlen(nextS+4, t2+4, src) + 4 |
||||
if ml > l { |
||||
t = t2 |
||||
s = nextS |
||||
l = ml |
||||
// This is ok, but check previous as well.
|
||||
} |
||||
} |
||||
// If the previous long is a candidate, use that...
|
||||
t2 = lCandidate.Prev.offset - e.cur |
||||
if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { |
||||
ml := e.matchlen(nextS+4, t2+4, src) + 4 |
||||
if ml > l { |
||||
t = t2 |
||||
s = nextS |
||||
l = ml |
||||
break |
||||
} |
||||
} |
||||
} |
||||
break |
||||
} |
||||
cv = next |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
if l == 0 { |
||||
l = e.matchlenLong(s+4, t+4, src) + 4 |
||||
} else if l == maxMatchLength { |
||||
l += e.matchlenLong(s+l, t+l, src) |
||||
} |
||||
|
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
if false { |
||||
if t >= s { |
||||
panic(fmt.Sprintln("s-t", s, t)) |
||||
} |
||||
if (s - t) > maxMatchOffset { |
||||
panic(fmt.Sprintln("mmo", s-t)) |
||||
} |
||||
if l < baseMatchLength { |
||||
panic("bml") |
||||
} |
||||
} |
||||
|
||||
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) |
||||
repeat = s - t |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
|
||||
if s >= sLimit { |
||||
// Index after match end.
|
||||
for i := nextS + 1; i < int32(len(src))-8; i += 2 { |
||||
cv := load6432(src, i) |
||||
e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)} |
||||
eLong := &e.bTable[hash7(cv, tableBits)] |
||||
eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur |
||||
} |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// Store every long hash in-between and every second short.
|
||||
if true { |
||||
for i := nextS + 1; i < s-1; i += 2 { |
||||
cv := load6432(src, i) |
||||
t := tableEntry{offset: i + e.cur, val: uint32(cv)} |
||||
t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} |
||||
eLong := &e.bTable[hash7(cv, tableBits)] |
||||
eLong2 := &e.bTable[hash7(cv>>8, tableBits)] |
||||
e.table[hash4x64(cv, tableBits)] = t |
||||
eLong.Cur, eLong.Prev = t, eLong.Cur |
||||
eLong2.Cur, eLong2.Prev = t2, eLong2.Cur |
||||
} |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s.
|
||||
cv = load6432(src, s) |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
|
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
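Compared to level 5, the level 6 encoder above also tracks `repeat`, the offset of the last emitted match, and probes it first at the next position ("Check repeat at s + repOff"). A compact, hypothetical illustration of such a repeat-offset probe (function and variable names are mine, not from the diff):

```go
package main

import (
	"bytes"
	"fmt"
)

// repeatMatchLen counts how many bytes at position s equal the bytes located
// "repeat" positions earlier, i.e. a match that reuses the previous offset.
func repeatMatchLen(src []byte, s, repeat int) int {
	t := s - repeat
	if t < 0 {
		return 0
	}
	n := 0
	for s+n < len(src) && src[t+n] == src[s+n] {
		n++
	}
	return n
}

func main() {
	src := bytes.Repeat([]byte("abcd"), 8) // highly repetitive input
	// Suppose the previous match used offset 4; probing position 12 with the
	// same offset yields a long match without any hash-table lookups.
	fmt.Println(repeatMatchLen(src, 12, 4)) // prints: 20
}
```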
@ -1,48 +0,0 @@ |
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package flate |
||||
|
||||
var reverseByte = [256]byte{ |
||||
0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, |
||||
0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, |
||||
0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, |
||||
0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, |
||||
0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, |
||||
0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, |
||||
0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, |
||||
0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, |
||||
0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, |
||||
0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, |
||||
0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, |
||||
0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, |
||||
0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, |
||||
0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, |
||||
0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, |
||||
0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, |
||||
0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, |
||||
0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, |
||||
0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, |
||||
0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, |
||||
0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, |
||||
0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, |
||||
0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, |
||||
0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, |
||||
0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, |
||||
0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, |
||||
0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, |
||||
0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, |
||||
0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, |
||||
0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, |
||||
0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, |
||||
0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, |
||||
} |
||||
|
||||
func reverseUint16(v uint16) uint16 { |
||||
return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8 |
||||
} |
||||
|
||||
func reverseBits(number uint16, bitLength byte) uint16 { |
||||
return reverseUint16(number << uint8(16-bitLength)) |
||||
} |
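The table-driven reversal above is used when emitting Huffman codes, which deflate writes bit-reversed. A quick standalone equivalence check (this uses math/bits and is not the vendored code) of what reverseBits computes:

```go
package main

import (
	"fmt"
	"math/bits"
)

// reverseBitsStd reverses the low bitLength bits of number, matching the
// table-driven reverseBits above, but via the standard library.
func reverseBitsStd(number uint16, bitLength byte) uint16 {
	return bits.Reverse16(number << (16 - uint16(bitLength)))
}

func main() {
	// 0b00110 reversed over 5 bits is 0b01100.
	fmt.Printf("%05b\n", reverseBitsStd(0b00110, 5)) // prints: 01100
}
```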
@ -1,856 +0,0 @@ |
||||
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
||||
// Modified for deflate by Klaus Post (c) 2015.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package flate |
||||
|
||||
// emitLiteral writes a literal chunk to dst.
|
||||
func emitLiteral(dst *tokens, lit []byte) { |
||||
ol := int(dst.n) |
||||
for i, v := range lit { |
||||
dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) |
||||
} |
||||
dst.n += uint16(len(lit)) |
||||
} |
||||
|
||||
// emitCopy writes a copy chunk to dst.
|
||||
func emitCopy(dst *tokens, offset, length int) { |
||||
dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize)) |
||||
dst.n++ |
||||
} |
||||
|
||||
type snappyEnc interface { |
||||
Encode(dst *tokens, src []byte) |
||||
Reset() |
||||
} |
||||
|
||||
func newSnappy(level int) snappyEnc { |
||||
switch level { |
||||
case 1: |
||||
return &snappyL1{} |
||||
case 2: |
||||
return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} |
||||
case 3: |
||||
return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} |
||||
case 4: |
||||
return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}} |
||||
default: |
||||
panic("invalid level specified") |
||||
} |
||||
} |
||||
|
||||
const ( |
||||
tableBits = 14 // Bits used in the table
|
||||
tableSize = 1 << tableBits // Size of the table
|
||||
tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
|
||||
tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
|
||||
baseMatchOffset = 1 // The smallest match offset
|
||||
baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5
|
||||
maxMatchOffset = 1 << 15 // The largest match offset
|
||||
) |
||||
|
||||
func load32(b []byte, i int) uint32 { |
||||
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
|
||||
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
||||
} |
||||
|
||||
func load64(b []byte, i int) uint64 { |
||||
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
|
||||
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | |
||||
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 |
||||
} |
||||
|
||||
func hash(u uint32) uint32 { |
||||
return (u * 0x1e35a7bd) >> tableShift |
||||
} |
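As a quick sanity check of how the constants and hash above fit together (a standalone snippet, not part of the diff): tableSize is 2^tableBits, and shifting the 32-bit multiplicative hash right by tableShift always yields an index smaller than tableSize, so the `&tableMask` applied at the call sites is redundant for correctness and mainly helps the compiler drop bounds checks, as the comment on tableMask notes.

```go
package main

import "fmt"

func main() {
	const tableBits = 14
	const tableSize = 1 << tableBits  // 16384
	const tableShift = 32 - tableBits // 18

	hash := func(u uint32) uint32 { return (u * 0x1e35a7bd) >> tableShift }

	max := uint32(0)
	for _, u := range []uint32{0, 1, 0xdeadbeef, 0xffffffff} {
		if h := hash(u); h > max {
			max = h
		}
	}
	fmt.Println(max < tableSize) // prints: true (a 32-bit value >> 18 fits in 14 bits)
}
```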
||||
|
||||
// snappyL1 encapsulates level 1 compression
|
||||
type snappyL1 struct{} |
||||
|
||||
func (e *snappyL1) Reset() {} |
||||
|
||||
func (e *snappyL1) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 16 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
// Initialize the hash table.
|
||||
//
|
||||
// The table element type is uint16, as s < sLimit and sLimit < len(src)
|
||||
// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
|
||||
var table [tableSize]uint16 |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := len(src) - inputMargin |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
nextEmit := 0 |
||||
|
||||
// The encoded form must start with a literal, as there are no previous
|
||||
// bytes to copy, so we start looking for hash matches at s == 1.
|
||||
s := 1 |
||||
nextHash := hash(load32(src, s)) |
||||
|
||||
for { |
||||
// Copied from the C++ snappy implementation:
|
||||
//
|
||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
||||
// found, start looking only at every other byte. If 32 more bytes are
|
||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
||||
// is found, immediately go back to looking at every byte. This is a
|
||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
||||
// due to more bookkeeping, but for non-compressible data (such as
|
||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
||||
// data is incompressible and doesn't bother looking for matches
|
||||
// everywhere.
|
||||
//
|
||||
// The "skip" variable keeps track of how many bytes there are since
|
||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
||||
// the number of bytes to move ahead for each iteration.
|
||||
skip := 32 |
||||
|
||||
nextS := s |
||||
candidate := 0 |
||||
for { |
||||
s = nextS |
||||
bytesBetweenHashLookups := skip >> 5 |
||||
nextS = s + bytesBetweenHashLookups |
||||
skip += bytesBetweenHashLookups |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidate = int(table[nextHash&tableMask]) |
||||
table[nextHash&tableMask] = uint16(s) |
||||
nextHash = hash(load32(src, nextS)) |
||||
// TODO: < should be <=, and add a test for that.
|
||||
if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) { |
||||
break |
||||
} |
||||
} |
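A tiny standalone illustration (not from the diff) of the skipping heuristic the scan loop above implements: skip starts at 32 and skip>>5 is the stride, so the stride grows by one byte for every 32 probes that fail to find a match.

```go
package main

import "fmt"

func main() {
	skip, pos := 32, 1
	for probes := 1; probes <= 96; probes++ {
		step := skip >> 5 // bytes to advance on this probe
		pos += step
		skip += step
		if probes%32 == 0 {
			fmt.Printf("after %d unmatched probes: position %d, stride %d\n", probes, pos, step)
		}
	}
}
```

On incompressible input this makes the scanner accelerate away quickly; on compressible input the effect resets, because skip is reinitialised at the top of the outer loop after each match.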
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
base := s |
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
// This is an inlined version of Snappy's:
|
||||
// s = extendMatch(src, candidate+4, s+4)
|
||||
s += 4 |
||||
s1 := base + maxMatchLength |
||||
if s1 > len(src) { |
||||
s1 = len(src) |
||||
} |
||||
a := src[s:s1] |
||||
b := src[candidate+4:] |
||||
b = b[:len(a)] |
||||
l := len(a) |
||||
for i := range a { |
||||
if a[i] != b[i] { |
||||
l = i |
||||
break |
||||
} |
||||
} |
||||
s += l |
||||
|
||||
// matchToken is flate's equivalent of Snappy's emitCopy.
|
||||
dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)) |
||||
dst.n++ |
||||
nextEmit = s |
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load64(src, s-1) |
||||
prevHash := hash(uint32(x >> 0)) |
||||
table[prevHash&tableMask] = uint16(s - 1) |
||||
currHash := hash(uint32(x >> 8)) |
||||
candidate = int(table[currHash&tableMask]) |
||||
table[currHash&tableMask] = uint16(s) |
||||
// TODO: >= should be >, and add a test for that.
|
||||
if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) { |
||||
nextHash = hash(uint32(x >> 16)) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if nextEmit < len(src) { |
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
||||
|
||||
type tableEntry struct { |
||||
val uint32 |
||||
offset int32 |
||||
} |
||||
|
||||
func load3232(b []byte, i int32) uint32 { |
||||
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
|
||||
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
||||
} |
||||
|
||||
func load6432(b []byte, i int32) uint64 { |
||||
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
|
||||
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | |
||||
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 |
||||
} |
||||
|
||||
// snappyGen maintains the table for matches,
|
||||
// and the previous byte block for level 2.
|
||||
// This is the generic implementation.
|
||||
type snappyGen struct { |
||||
prev []byte |
||||
cur int32 |
||||
} |
||||
|
||||
// snappyL2 maintains the table for matches,
|
||||
// and the previous byte block for level 2.
|
||||
// This is the generic implementation.
|
||||
type snappyL2 struct { |
||||
snappyGen |
||||
table [tableSize]tableEntry |
||||
} |
||||
|
||||
// Encode uses a similar algorithm to level 1, but is capable
|
||||
// of matching across blocks giving better compression at a small slowdown.
|
||||
func (e *snappyL2) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 16 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
|
||||
if e.cur > 1<<30 { |
||||
for i := range e.table { |
||||
e.table[i] = tableEntry{} |
||||
} |
||||
e.cur = maxStoreBlockSize |
||||
} |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
e.cur += maxStoreBlockSize |
||||
e.prev = e.prev[:0] |
||||
return |
||||
} |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
nextEmit := int32(0) |
||||
s := int32(0) |
||||
cv := load3232(src, s) |
||||
nextHash := hash(cv) |
||||
|
||||
for { |
||||
// Copied from the C++ snappy implementation:
|
||||
//
|
||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
||||
// found, start looking only at every other byte. If 32 more bytes are
|
||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
||||
// is found, immediately go back to looking at every byte. This is a
|
||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
||||
// due to more bookkeeping, but for non-compressible data (such as
|
||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
||||
// data is incompressible and doesn't bother looking for matches
|
||||
// everywhere.
|
||||
//
|
||||
// The "skip" variable keeps track of how many bytes there are since
|
||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
||||
// the number of bytes to move ahead for each iteration.
|
||||
skip := int32(32) |
||||
|
||||
nextS := s |
||||
var candidate tableEntry |
||||
for { |
||||
s = nextS |
||||
bytesBetweenHashLookups := skip >> 5 |
||||
nextS = s + bytesBetweenHashLookups |
||||
skip += bytesBetweenHashLookups |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidate = e.table[nextHash&tableMask] |
||||
now := load3232(src, nextS) |
||||
e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} |
||||
nextHash = hash(now) |
||||
|
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset >= maxMatchOffset || cv != candidate.val { |
||||
// Out of range or not matched.
|
||||
cv = now |
||||
continue |
||||
} |
||||
break |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
s += 4 |
||||
t := candidate.offset - e.cur + 4 |
||||
l := e.matchlen(s, t, src) |
||||
|
||||
// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
|
||||
dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) |
||||
dst.n++ |
||||
s += l |
||||
nextEmit = s |
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load6432(src, s-1) |
||||
prevHash := hash(uint32(x)) |
||||
e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} |
||||
x >>= 8 |
||||
currHash := hash(uint32(x)) |
||||
candidate = e.table[currHash&tableMask] |
||||
e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} |
||||
|
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset >= maxMatchOffset || uint32(x) != candidate.val { |
||||
cv = uint32(x >> 8) |
||||
nextHash = hash(cv) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
e.cur += int32(len(src)) |
||||
e.prev = e.prev[:len(src)] |
||||
copy(e.prev, src) |
||||
} |
||||
|
||||
type tableEntryPrev struct { |
||||
Cur tableEntry |
||||
Prev tableEntry |
||||
} |
||||
|
||||
// snappyL3 encapsulates level 3 compression: like level 2, but each table bucket keeps the two most recent candidates.
|
||||
type snappyL3 struct { |
||||
snappyGen |
||||
table [tableSize]tableEntryPrev |
||||
} |
||||
|
||||
// Encode uses a similar algorithm to level 2, but will check up to two candidates.
|
||||
func (e *snappyL3) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 16 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
|
||||
if e.cur > 1<<30 { |
||||
for i := range e.table { |
||||
e.table[i] = tableEntryPrev{} |
||||
} |
||||
e.cur = maxStoreBlockSize |
||||
} |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
e.cur += maxStoreBlockSize |
||||
e.prev = e.prev[:0] |
||||
return |
||||
} |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
nextEmit := int32(0) |
||||
s := int32(0) |
||||
cv := load3232(src, s) |
||||
nextHash := hash(cv) |
||||
|
||||
for { |
||||
// Copied from the C++ snappy implementation:
|
||||
//
|
||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
||||
// found, start looking only at every other byte. If 32 more bytes are
|
||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
||||
// is found, immediately go back to looking at every byte. This is a
|
||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
||||
// due to more bookkeeping, but for non-compressible data (such as
|
||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
||||
// data is incompressible and doesn't bother looking for matches
|
||||
// everywhere.
|
||||
//
|
||||
// The "skip" variable keeps track of how many bytes there are since
|
||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
||||
// the number of bytes to move ahead for each iteration.
|
||||
skip := int32(32) |
||||
|
||||
nextS := s |
||||
var candidate tableEntry |
||||
for { |
||||
s = nextS |
||||
bytesBetweenHashLookups := skip >> 5 |
||||
nextS = s + bytesBetweenHashLookups |
||||
skip += bytesBetweenHashLookups |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidates := e.table[nextHash&tableMask] |
||||
now := load3232(src, nextS) |
||||
e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} |
||||
nextHash = hash(now) |
||||
|
||||
// Check both candidates
|
||||
candidate = candidates.Cur |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
break |
||||
} |
||||
} else { |
||||
// We only check if value mismatches.
|
||||
// Offset will always be invalid in other cases.
|
||||
candidate = candidates.Prev |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
break |
||||
} |
||||
} |
||||
} |
||||
cv = now |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
s += 4 |
||||
t := candidate.offset - e.cur + 4 |
||||
l := e.matchlen(s, t, src) |
||||
|
||||
// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
|
||||
dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) |
||||
dst.n++ |
||||
s += l |
||||
nextEmit = s |
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-2, s-1 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load6432(src, s-2) |
||||
prevHash := hash(uint32(x)) |
||||
|
||||
e.table[prevHash&tableMask] = tableEntryPrev{ |
||||
Prev: e.table[prevHash&tableMask].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
prevHash = hash(uint32(x)) |
||||
|
||||
e.table[prevHash&tableMask] = tableEntryPrev{ |
||||
Prev: e.table[prevHash&tableMask].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
currHash := hash(uint32(x)) |
||||
candidates := e.table[currHash&tableMask] |
||||
cv = uint32(x) |
||||
e.table[currHash&tableMask] = tableEntryPrev{ |
||||
Prev: candidates.Cur, |
||||
Cur: tableEntry{offset: s + e.cur, val: cv}, |
||||
} |
||||
|
||||
// Check both candidates
|
||||
candidate = candidates.Cur |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
continue |
||||
} |
||||
} else { |
||||
// We only check if value mismatches.
|
||||
// Offset will always be invalid in other cases.
|
||||
candidate = candidates.Prev |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
continue |
||||
} |
||||
} |
||||
} |
||||
cv = uint32(x >> 8) |
||||
nextHash = hash(cv) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
e.cur += int32(len(src)) |
||||
e.prev = e.prev[:len(src)] |
||||
copy(e.prev, src) |
||||
} |
||||
|
||||
// snappyL4 encapsulates level 4 compression: level 3 plus a fallback to the alternative candidate when the first match is short.
|
||||
type snappyL4 struct { |
||||
snappyL3 |
||||
} |
||||
|
||||
// Encode uses a similar algorithm to level 3,
|
||||
// but will check up to two candidates if the first isn't long enough.
|
||||
func (e *snappyL4) Encode(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 16 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
matchLenGood = 12 |
||||
) |
||||
|
||||
// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
|
||||
if e.cur > 1<<30 { |
||||
for i := range e.table { |
||||
e.table[i] = tableEntryPrev{} |
||||
} |
||||
e.cur = maxStoreBlockSize |
||||
} |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
e.cur += maxStoreBlockSize |
||||
e.prev = e.prev[:0] |
||||
return |
||||
} |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int32(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
nextEmit := int32(0) |
||||
s := int32(0) |
||||
cv := load3232(src, s) |
||||
nextHash := hash(cv) |
||||
|
||||
for { |
||||
// Copied from the C++ snappy implementation:
|
||||
//
|
||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
||||
// found, start looking only at every other byte. If 32 more bytes are
|
||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
||||
// is found, immediately go back to looking at every byte. This is a
|
||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
||||
// due to more bookkeeping, but for non-compressible data (such as
|
||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
||||
// data is incompressible and doesn't bother looking for matches
|
||||
// everywhere.
|
||||
//
|
||||
// The "skip" variable keeps track of how many bytes there are since
|
||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
||||
// the number of bytes to move ahead for each iteration.
|
||||
skip := int32(32) |
||||
|
||||
nextS := s |
||||
var candidate tableEntry |
||||
var candidateAlt tableEntry |
||||
for { |
||||
s = nextS |
||||
bytesBetweenHashLookups := skip >> 5 |
||||
nextS = s + bytesBetweenHashLookups |
||||
skip += bytesBetweenHashLookups |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidates := e.table[nextHash&tableMask] |
||||
now := load3232(src, nextS) |
||||
e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} |
||||
nextHash = hash(now) |
||||
|
||||
// Check both candidates
|
||||
candidate = candidates.Cur |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
offset = s - (candidates.Prev.offset - e.cur) |
||||
if cv == candidates.Prev.val && offset < maxMatchOffset { |
||||
candidateAlt = candidates.Prev |
||||
} |
||||
break |
||||
} |
||||
} else { |
||||
// We only check if value mismatches.
|
||||
// Offset will always be invalid in other cases.
|
||||
candidate = candidates.Prev |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
break |
||||
} |
||||
} |
||||
} |
||||
cv = now |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
s += 4 |
||||
t := candidate.offset - e.cur + 4 |
||||
l := e.matchlen(s, t, src) |
||||
// Try alternative candidate if match length < matchLenGood.
|
||||
if l < matchLenGood-4 && candidateAlt.offset != 0 { |
||||
t2 := candidateAlt.offset - e.cur + 4 |
||||
l2 := e.matchlen(s, t2, src) |
||||
if l2 > l { |
||||
l = l2 |
||||
t = t2 |
||||
} |
||||
} |
||||
// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
|
||||
dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) |
||||
dst.n++ |
||||
s += l |
||||
nextEmit = s |
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-2, s-1 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load6432(src, s-2) |
||||
prevHash := hash(uint32(x)) |
||||
|
||||
e.table[prevHash&tableMask] = tableEntryPrev{ |
||||
Prev: e.table[prevHash&tableMask].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
prevHash = hash(uint32(x)) |
||||
|
||||
e.table[prevHash&tableMask] = tableEntryPrev{ |
||||
Prev: e.table[prevHash&tableMask].Cur, |
||||
Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, |
||||
} |
||||
x >>= 8 |
||||
currHash := hash(uint32(x)) |
||||
candidates := e.table[currHash&tableMask] |
||||
cv = uint32(x) |
||||
e.table[currHash&tableMask] = tableEntryPrev{ |
||||
Prev: candidates.Cur, |
||||
Cur: tableEntry{offset: s + e.cur, val: cv}, |
||||
} |
||||
|
||||
// Check both candidates
|
||||
candidate = candidates.Cur |
||||
candidateAlt = tableEntry{} |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
offset = s - (candidates.Prev.offset - e.cur) |
||||
if cv == candidates.Prev.val && offset < maxMatchOffset { |
||||
candidateAlt = candidates.Prev |
||||
} |
||||
continue |
||||
} |
||||
} else { |
||||
// We only check if value mismatches.
|
||||
// Offset will always be invalid in other cases.
|
||||
candidate = candidates.Prev |
||||
if cv == candidate.val { |
||||
offset := s - (candidate.offset - e.cur) |
||||
if offset < maxMatchOffset { |
||||
continue |
||||
} |
||||
} |
||||
} |
||||
cv = uint32(x >> 8) |
||||
nextHash = hash(cv) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
e.cur += int32(len(src)) |
||||
e.prev = e.prev[:len(src)] |
||||
copy(e.prev, src) |
||||
} |
||||
|
||||
func (e *snappyGen) matchlen(s, t int32, src []byte) int32 { |
||||
s1 := int(s) + maxMatchLength - 4 |
||||
if s1 > len(src) { |
||||
s1 = len(src) |
||||
} |
||||
|
||||
// If we are inside the current block
|
||||
if t >= 0 { |
||||
b := src[t:] |
||||
a := src[s:s1] |
||||
b = b[:len(a)] |
||||
// Extend the match to be as long as possible.
|
||||
for i := range a { |
||||
if a[i] != b[i] { |
||||
return int32(i) |
||||
} |
||||
} |
||||
return int32(len(a)) |
||||
} |
||||
|
||||
// We found a match in the previous block.
|
||||
tp := int32(len(e.prev)) + t |
||||
if tp < 0 { |
||||
return 0 |
||||
} |
||||
|
||||
// Extend the match to be as long as possible.
|
||||
a := src[s:s1] |
||||
b := e.prev[tp:] |
||||
if len(b) > len(a) { |
||||
b = b[:len(a)] |
||||
} |
||||
a = a[:len(b)] |
||||
for i := range b { |
||||
if a[i] != b[i] { |
||||
return int32(i) |
||||
} |
||||
} |
||||
n := int32(len(b)) |
||||
a = src[s+n : s1] |
||||
b = src[:len(a)] |
||||
for i := range a { |
||||
if a[i] != b[i] { |
||||
return int32(i) + n |
||||
} |
||||
} |
||||
return int32(len(a)) + n |
||||
} |
||||
|
||||
// Reset the encoding table.
|
||||
func (e *snappyGen) Reset() { |
||||
e.prev = e.prev[:0] |
||||
e.cur += maxMatchOffset + 1 |
||||
} |
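matchlen above also handles the case where t is negative, meaning the candidate match begins inside the previous block kept in e.prev and may continue into the current one. A simplified, hypothetical walk-through of that case (the helper names are mine, not from the diff):

```go
package main

import "fmt"

// crossBlockMatch counts matching bytes when the candidate may start in the
// previous block: negative indices address prev from its end, similar to how
// matchlen computes tp = len(e.prev) + t.
func crossBlockMatch(prev, cur []byte, s, t int) int {
	read := func(i int) byte {
		if i < 0 {
			return prev[len(prev)+i]
		}
		return cur[i]
	}
	n := 0
	for s+n < len(cur) && read(t+n) == read(s+n) {
		n++
	}
	return n
}

func main() {
	prev := []byte("....abcd")
	cur := []byte("abcdXYZ")
	// t = -4 points at the "abcd" at the end of the previous block.
	fmt.Println(crossBlockMatch(prev, cur, 0, -4)) // prints: 4
}
```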
@ -0,0 +1,252 @@ |
||||
package flate |
||||
|
||||
import ( |
||||
"io" |
||||
"math" |
||||
) |
||||
|
||||
const ( |
||||
maxStatelessBlock = math.MaxInt16 |
||||
|
||||
slTableBits = 13 |
||||
slTableSize = 1 << slTableBits |
||||
slTableShift = 32 - slTableBits |
||||
) |
||||
|
||||
type statelessWriter struct { |
||||
dst io.Writer |
||||
closed bool |
||||
} |
||||
|
||||
func (s *statelessWriter) Close() error { |
||||
if s.closed { |
||||
return nil |
||||
} |
||||
s.closed = true |
||||
// Emit EOF block
|
||||
return StatelessDeflate(s.dst, nil, true) |
||||
} |
||||
|
||||
func (s *statelessWriter) Write(p []byte) (n int, err error) { |
||||
err = StatelessDeflate(s.dst, p, false) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
return len(p), nil |
||||
} |
||||
|
||||
func (s *statelessWriter) Reset(w io.Writer) { |
||||
s.dst = w |
||||
s.closed = false |
||||
} |
||||
|
||||
// NewStatelessWriter will do compression without maintaining any state
|
||||
// between Write calls.
|
||||
// There will be no memory kept between Write calls,
|
||||
// but compression and speed will be suboptimal.
|
||||
// Because of this, the size of individual Write calls will affect the output size.
|
||||
func NewStatelessWriter(dst io.Writer) io.WriteCloser { |
||||
return &statelessWriter{dst: dst} |
||||
} |
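A short usage sketch for the writer above. It assumes the package is imported from its upstream path github.com/klauspost/compress/flate; adjust the import if you are building against the vendored copy in this tree.

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer
	w := flate.NewStatelessWriter(&buf)
	// Each Write is compressed on its own; nothing is remembered between calls.
	if _, err := w.Write(bytes.Repeat([]byte("hello "), 50)); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil { // emits the final EOF block
		panic(err)
	}
	fmt.Println("compressed bytes:", buf.Len())
}
```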
||||
|
||||
// StatelessDeflate allows compressing directly to a Writer without retaining state.
|
||||
// When it returns, everything will have been flushed.
|
||||
func StatelessDeflate(out io.Writer, in []byte, eof bool) error { |
||||
var dst tokens |
||||
bw := newHuffmanBitWriter(out) |
||||
if eof && len(in) == 0 { |
||||
// Just write an EOF block.
|
||||
// Could be faster...
|
||||
bw.writeStoredHeader(0, true) |
||||
bw.flush() |
||||
return bw.err |
||||
} |
||||
|
||||
for len(in) > 0 { |
||||
todo := in |
||||
if len(todo) > maxStatelessBlock { |
||||
todo = todo[:maxStatelessBlock] |
||||
} |
||||
in = in[len(todo):] |
||||
// Compress
|
||||
statelessEnc(&dst, todo) |
||||
isEof := eof && len(in) == 0 |
||||
|
||||
if dst.n == 0 { |
||||
bw.writeStoredHeader(len(todo), isEof) |
||||
if bw.err != nil { |
||||
return bw.err |
||||
} |
||||
bw.writeBytes(todo) |
||||
} else if int(dst.n) > len(todo)-len(todo)>>4 { |
||||
// If we removed less than 1/16th, Huffman-compress the block.
|
||||
bw.writeBlockHuff(isEof, todo, false) |
||||
} else { |
||||
bw.writeBlockDynamic(&dst, isEof, todo, false) |
||||
} |
||||
if bw.err != nil { |
||||
return bw.err |
||||
} |
||||
dst.Reset() |
||||
} |
||||
if !eof { |
||||
// Align.
|
||||
bw.writeStoredHeader(0, false) |
||||
} |
||||
bw.flush() |
||||
return bw.err |
||||
} |
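A quick numeric check (standalone, not part of the diff) of the block-choice rule inside StatelessDeflate above: for a 4096-byte chunk the dynamic-Huffman path is taken only when tokenization shrank it to at most 4096 - 4096/16 = 3840 tokens; otherwise only literal Huffman coding is applied.

```go
package main

import "fmt"

func main() {
	todo := 4096
	threshold := todo - todo>>4 // same expression as len(todo)-len(todo)>>4
	for _, n := range []int{3500, 3840, 4000} {
		if n > threshold {
			fmt.Println(n, "tokens -> writeBlockHuff (literals only)")
		} else {
			fmt.Println(n, "tokens -> writeBlockDynamic (tokens + dynamic Huffman)")
		}
	}
}
```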
||||
|
||||
func hashSL(u uint32) uint32 { |
||||
return (u * 0x1e35a7bd) >> slTableShift |
||||
} |
||||
|
||||
func load3216(b []byte, i int16) uint32 { |
||||
// Help the compiler eliminate bounds checks so the load can be done in a single read.
|
||||
b = b[i:] |
||||
b = b[:4] |
||||
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
||||
} |
||||
|
||||
func load6416(b []byte, i int16) uint64 { |
||||
// Help the compiler eliminate bounds checks so the load can be done in a single read.
|
||||
b = b[i:] |
||||
b = b[:8] |
||||
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | |
||||
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 |
||||
} |
||||
|
||||
func statelessEnc(dst *tokens, src []byte) { |
||||
const ( |
||||
inputMargin = 12 - 1 |
||||
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
) |
||||
|
||||
type tableEntry struct { |
||||
offset int16 |
||||
} |
||||
|
||||
var table [slTableSize]tableEntry |
||||
|
||||
// This check isn't in the Snappy implementation, but there, the caller
|
||||
// instead of the callee handles this case.
|
||||
if len(src) < minNonLiteralBlockSize { |
||||
// We do not fill the token table.
|
||||
// This will be picked up by caller.
|
||||
dst.n = uint16(len(src)) |
||||
return |
||||
} |
||||
|
||||
s := int16(1) |
||||
nextEmit := int16(0) |
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := int16(len(src) - inputMargin) |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
cv := load3216(src, s) |
||||
|
||||
for { |
||||
const skipLog = 5 |
||||
const doEvery = 2 |
||||
|
||||
nextS := s |
||||
var candidate tableEntry |
||||
for { |
||||
nextHash := hashSL(cv) |
||||
candidate = table[nextHash] |
||||
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||
if nextS > sLimit || nextS <= 0 { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
now := load6416(src, nextS) |
||||
table[nextHash] = tableEntry{offset: s} |
||||
nextHash = hashSL(uint32(now)) |
||||
|
||||
if cv == load3216(src, candidate.offset) { |
||||
table[nextHash] = tableEntry{offset: nextS} |
||||
break |
||||
} |
||||
|
||||
// Do one right away...
|
||||
cv = uint32(now) |
||||
s = nextS |
||||
nextS++ |
||||
candidate = table[nextHash] |
||||
now >>= 8 |
||||
table[nextHash] = tableEntry{offset: s} |
||||
|
||||
if cv == load3216(src, candidate.offset) { |
||||
table[nextHash] = tableEntry{offset: nextS} |
||||
break |
||||
} |
||||
cv = uint32(now) |
||||
s = nextS |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
t := candidate.offset |
||||
l := int16(matchLen(src[s+4:], src[t+4:]) + 4) |
||||
|
||||
// Extend backwards
|
||||
for t > 0 && s > nextEmit && src[t-1] == src[s-1] { |
||||
s-- |
||||
t-- |
||||
l++ |
||||
} |
||||
if nextEmit < s { |
||||
emitLiteral(dst, src[nextEmit:s]) |
||||
} |
||||
|
||||
// Save the match found
|
||||
dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset)) |
||||
s += l |
||||
nextEmit = s |
||||
if nextS >= s { |
||||
s = nextS + 1 |
||||
} |
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-2 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load6416(src, s-2) |
||||
o := s - 2 |
||||
prevHash := hashSL(uint32(x)) |
||||
table[prevHash] = tableEntry{offset: o} |
||||
x >>= 16 |
||||
currHash := hashSL(uint32(x)) |
||||
candidate = table[currHash] |
||||
table[currHash] = tableEntry{offset: o + 2} |
||||
|
||||
if uint32(x) != load3216(src, candidate.offset) { |
||||
cv = uint32(x >> 8) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if int(nextEmit) < len(src) { |
||||
// If nothing was added, don't encode literals.
|
||||
if dst.n == 0 { |
||||
return |
||||
} |
||||
emitLiteral(dst, src[nextEmit:]) |
||||
} |
||||
} |
@ -1,24 +0,0 @@ |
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects) |
||||
*.o |
||||
*.a |
||||
*.so |
||||
|
||||
# Folders |
||||
_obj |
||||
_test |
||||
|
||||
# Architecture specific extensions/prefixes |
||||
*.[568vq] |
||||
[568vq].out |
||||
|
||||
*.cgo1.go |
||||
*.cgo2.c |
||||
_cgo_defun.c |
||||
_cgo_gotypes.go |
||||
_cgo_export.* |
||||
|
||||
_testmain.go |
||||
|
||||
*.exe |
||||
*.test |
||||
*.prof |
@ -1,8 +0,0 @@ |
||||
language: go |
||||
|
||||
go: |
||||
- 1.3 |
||||
- 1.4 |
||||
- 1.5 |
||||
- 1.6 |
||||
- tip |
@ -1,22 +0,0 @@ |
||||
The MIT License (MIT) |
||||
|
||||
Copyright (c) 2015 Klaus Post |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
||||
|
@ -1,145 +0,0 @@ |
||||
# cpuid |
||||
Package cpuid provides information about the CPU running the current program. |
||||
|
||||
CPU features are detected on startup, and kept for fast access through the life of the application. |
||||
Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use. |
||||
|
||||
You can access the CPU information through the shared CPU variable of the cpuid library. |
||||
|
||||
Package home: https://github.com/klauspost/cpuid |
||||
|
||||
[![GoDoc][1]][2] [![Build Status][3]][4] |
||||
|
||||
[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg |
||||
[2]: https://godoc.org/github.com/klauspost/cpuid |
||||
[3]: https://travis-ci.org/klauspost/cpuid.svg |
||||
[4]: https://travis-ci.org/klauspost/cpuid |
||||
|
||||
# features |
||||
## CPU Instructions |
||||
* **CMOV** (i686 CMOV) |
||||
* **NX** (NX (No-Execute) bit) |
||||
* **AMD3DNOW** (AMD 3DNOW) |
||||
* **AMD3DNOWEXT** (AMD 3DNowExt) |
||||
* **MMX** (standard MMX) |
||||
* **MMXEXT** (SSE integer functions or AMD MMX ext) |
||||
* **SSE** (SSE functions) |
||||
* **SSE2** (P4 SSE functions) |
||||
* **SSE3** (Prescott SSE3 functions) |
||||
* **SSSE3** (Conroe SSSE3 functions) |
||||
* **SSE4** (Penryn SSE4.1 functions) |
||||
* **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions) |
||||
* **SSE42** (Nehalem SSE4.2 functions) |
||||
* **AVX** (AVX functions) |
||||
* **AVX2** (AVX2 functions) |
||||
* **FMA3** (Intel FMA 3) |
||||
* **FMA4** (Bulldozer FMA4 functions) |
||||
* **XOP** (Bulldozer XOP functions) |
||||
* **F16C** (Half-precision floating-point conversion) |
||||
* **BMI1** (Bit Manipulation Instruction Set 1) |
||||
* **BMI2** (Bit Manipulation Instruction Set 2) |
||||
* **TBM** (AMD Trailing Bit Manipulation) |
||||
* **LZCNT** (LZCNT instruction) |
||||
* **POPCNT** (POPCNT instruction) |
||||
* **AESNI** (Advanced Encryption Standard New Instructions) |
||||
* **CLMUL** (Carry-less Multiplication) |
||||
* **HTT** (Hyperthreading (enabled)) |
||||
* **HLE** (Hardware Lock Elision) |
||||
* **RTM** (Restricted Transactional Memory) |
||||
* **RDRAND** (RDRAND instruction is available) |
||||
* **RDSEED** (RDSEED instruction is available) |
||||
* **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions)) |
||||
* **SHA** (Intel SHA Extensions) |
||||
* **AVX512F** (AVX-512 Foundation) |
||||
* **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions) |
||||
* **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions) |
||||
* **AVX512PF** (AVX-512 Prefetch Instructions) |
||||
* **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions) |
||||
* **AVX512CD** (AVX-512 Conflict Detection Instructions) |
||||
* **AVX512BW** (AVX-512 Byte and Word Instructions) |
||||
* **AVX512VL** (AVX-512 Vector Length Extensions) |
||||
* **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions) |
||||
* **MPX** (Intel MPX (Memory Protection Extensions)) |
||||
* **ERMS** (Enhanced REP MOVSB/STOSB) |
||||
* **RDTSCP** (RDTSCP Instruction) |
||||
* **CX16** (CMPXCHG16B Instruction) |
||||
* **SGX** (Software Guard Extensions, with activation details) |
||||
|
||||
## Performance |
||||
* **RDTSCP()** Returns current cycle count. Can be used for benchmarking. |
||||
* **SSE2SLOW** (SSE2 is supported, but usually not faster) |
||||
* **SSE3SLOW** (SSE3 is supported, but usually not faster) |
||||
* **ATOM** (Atom processor, some SSSE3 instructions are slower) |
||||
* **Cache line** (Probable size of a cache line). |
||||
* **L1, L2, L3 Cache size** on newer Intel/AMD CPUs. |
||||
|
||||
## CPU Vendor/VM |
||||
* **Intel** |
||||
* **AMD** |
||||
* **VIA** |
||||
* **Transmeta** |
||||
* **NSC** |
||||
* **KVM** (Kernel-based Virtual Machine) |
||||
* **MSVM** (Microsoft Hyper-V or Windows Virtual PC) |
||||
* **VMware** |
||||
* **XenHVM** |
||||
|
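Vendor and virtual-machine detection can be sketched as below, assuming the v1 API exposes the vendor as `cpuid.CPU.VendorID` with constants named after the list above (e.g. `cpuid.Intel`, `cpuid.KVM`); treat the exact names as assumptions.

```Go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid"
)

func main() {
	// VendorID and the vendor constants are assumed v1 API names.
	switch cpuid.CPU.VendorID {
	case cpuid.Intel:
		fmt.Println("Intel CPU")
	case cpuid.AMD:
		fmt.Println("AMD CPU")
	case cpuid.KVM, cpuid.MSVM, cpuid.VMware, cpuid.XenHVM:
		fmt.Println("running inside a virtual machine")
	default:
		fmt.Println("other or unknown vendor")
	}
}
```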
||||
# installing |
||||
|
||||
```go get github.com/klauspost/cpuid``` |
||||
|
||||
# example |
||||
|
||||
```Go |
||||
package main |
||||
|
||||
import ( |
||||
"fmt" |
||||
"github.com/klauspost/cpuid" |
||||
) |
||||
|
||||
func main() { |
||||
// Print basic CPU information: |
||||
fmt.Println("Name:", cpuid.CPU.BrandName) |
||||
fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores) |
||||
fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore) |
||||
fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores) |
||||
fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model) |
||||
fmt.Println("Features:", cpuid.CPU.Features) |
||||
fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine) |
||||
fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes") |
||||
fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1D, "bytes") |
||||
fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes") |
||||
fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes") |
||||
|
||||
// Test if we have a specific feature: |
||||
if cpuid.CPU.SSE() { |
||||
fmt.Println("We have Streaming SIMD Extensions") |
||||
} |
||||
} |
||||
``` |
||||
|
||||
Sample output: |
||||
``` |
||||
>go run main.go |
||||
Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz |
||||
PhysicalCores: 2 |
||||
ThreadsPerCore: 2 |
||||
LogicalCores: 4 |
||||
Family 6 Model: 42 |
||||
Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL |
||||
Cacheline bytes: 64 |
||||
We have Streaming SIMD Extensions |
||||
``` |
||||
|
||||
# private package |
||||
|
||||
In the "private" folder you can find an autogenerated version of the library you can include in your own packages. |
||||
|
||||
For this purpose all exports are removed, and functions and constants are lowercased. |
||||
|
||||
This is not the recommended way of using the library, but it is provided for convenience if depending on external packages is difficult for you. |
||||
|
||||
# license |
||||
|
||||
This code is published under an MIT license. See the LICENSE file for more information. |
File diff suppressed because it is too large
@ -1,42 +0,0 @@ |
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. |
||||
|
||||
// +build 386,!gccgo |
||||
|
||||
// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) |
||||
TEXT ·asmCpuid(SB), 7, $0 |
||||
XORL CX, CX |
||||
MOVL op+0(FP), AX |
||||
CPUID |
||||
MOVL AX, eax+4(FP) |
||||
MOVL BX, ebx+8(FP) |
||||
MOVL CX, ecx+12(FP) |
||||
MOVL DX, edx+16(FP) |
||||
RET |
||||
|
||||
// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) |
||||
TEXT ·asmCpuidex(SB), 7, $0 |
||||
MOVL op+0(FP), AX |
||||
MOVL op2+4(FP), CX |
||||
CPUID |
||||
MOVL AX, eax+8(FP) |
||||
MOVL BX, ebx+12(FP) |
||||
MOVL CX, ecx+16(FP) |
||||
MOVL DX, edx+20(FP) |
||||
RET |
||||
|
||||
// func xgetbv(index uint32) (eax, edx uint32) |
||||
TEXT ·asmXgetbv(SB), 7, $0 |
||||
MOVL index+0(FP), CX |
||||
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
|
||||
MOVL AX, eax+4(FP) |
||||
MOVL DX, edx+8(FP) |
||||
RET |
||||
|
||||
// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) |
||||
TEXT ·asmRdtscpAsm(SB), 7, $0 |
||||
BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
|
||||
MOVL AX, eax+0(FP) |
||||
MOVL BX, ebx+4(FP) |
||||
MOVL CX, ecx+8(FP) |
||||
MOVL DX, edx+12(FP) |
||||
RET |
@ -1,42 +0,0 @@ |
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. |
||||
|
||||
//+build amd64,!gccgo |
||||
|
||||
// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) |
||||
TEXT ·asmCpuid(SB), 7, $0 |
||||
XORQ CX, CX |
||||
MOVL op+0(FP), AX |
||||
CPUID |
||||
MOVL AX, eax+8(FP) |
||||
MOVL BX, ebx+12(FP) |
||||
MOVL CX, ecx+16(FP) |
||||
MOVL DX, edx+20(FP) |
||||
RET |
||||
|
||||
// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) |
||||
TEXT ·asmCpuidex(SB), 7, $0 |
||||
MOVL op+0(FP), AX |
||||
MOVL op2+4(FP), CX |
||||
CPUID |
||||
MOVL AX, eax+8(FP) |
||||
MOVL BX, ebx+12(FP) |
||||
MOVL CX, ecx+16(FP) |
||||
MOVL DX, edx+20(FP) |
||||
RET |
||||
|
||||
// func asmXgetbv(index uint32) (eax, edx uint32) |
||||
TEXT ·asmXgetbv(SB), 7, $0 |
||||
MOVL index+0(FP), CX |
||||
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
|
||||
MOVL AX, eax+8(FP) |
||||
MOVL DX, edx+12(FP) |
||||
RET |
||||
|
||||
// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) |
||||
TEXT ·asmRdtscpAsm(SB), 7, $0 |
||||
BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
|
||||
MOVL AX, eax+0(FP) |
||||
MOVL BX, ebx+4(FP) |
||||
MOVL CX, ecx+8(FP) |
||||
MOVL DX, edx+12(FP) |
||||
RET |
@ -1,17 +0,0 @@ |
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
|
||||
|
||||
// +build 386,!gccgo amd64,!gccgo
|
||||
|
||||
package cpuid |
||||
|
||||
func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) |
||||
func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) |
||||
func asmXgetbv(index uint32) (eax, edx uint32) |
||||
func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) |
||||
|
||||
func initCPU() { |
||||
cpuid = asmCpuid |
||||
cpuidex = asmCpuidex |
||||
xgetbv = asmXgetbv |
||||
rdtscpAsm = asmRdtscpAsm |
||||
} |
@ -1,23 +0,0 @@ |
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
|
||||
|
||||
// +build !amd64,!386 gccgo
|
||||
|
||||
package cpuid |
||||
|
||||
func initCPU() { |
||||
cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { |
||||
return 0, 0, 0, 0 |
||||
} |
||||
|
||||
cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { |
||||
return 0, 0, 0, 0 |
||||
} |
||||
|
||||
xgetbv = func(index uint32) (eax, edx uint32) { |
||||
return 0, 0 |
||||
} |
||||
|
||||
rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { |
||||
return 0, 0, 0, 0 |
||||
} |
||||
} |
@ -1,3 +0,0 @@ |
||||
package cpuid |
||||
|
||||
//go:generate go run private-gen.go
|
@ -1,24 +0,0 @@ |
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects) |
||||
*.o |
||||
*.a |
||||
*.so |
||||
|
||||
# Folders |
||||
_obj |
||||
_test |
||||
|
||||
# Architecture specific extensions/prefixes |
||||
*.[568vq] |
||||
[568vq].out |
||||
|
||||
*.cgo1.go |
||||
*.cgo2.c |
||||
_cgo_defun.c |
||||
_cgo_gotypes.go |
||||
_cgo_export.* |
||||
|
||||
_testmain.go |
||||
|
||||
*.exe |
||||
*.test |
||||
*.prof |
@ -1,13 +0,0 @@ |
||||
language: go |
||||
|
||||
go: |
||||
- 1.3 |
||||
- 1.4 |
||||
- 1.5 |
||||
- 1.6 |
||||
- 1.7 |
||||
- tip |
||||
|
||||
script: |
||||
- go test -v . |
||||
- go test -v -race . |
@ -1,28 +0,0 @@ |
||||
Copyright (c) 2012 The Go Authors. All rights reserved. |
||||
Copyright (c) 2015 Klaus Post |
||||
|
||||
Redistribution and use in source and binary forms, with or without |
||||
modification, are permitted provided that the following conditions are |
||||
met: |
||||
|
||||
* Redistributions of source code must retain the above copyright |
||||
notice, this list of conditions and the following disclaimer. |
||||
* Redistributions in binary form must reproduce the above |
||||
copyright notice, this list of conditions and the following disclaimer |
||||
in the documentation and/or other materials provided with the |
||||
distribution. |
||||
* Neither the name of Google Inc. nor the names of its |
||||
contributors may be used to endorse or promote products derived from |
||||
this software without specific prior written permission. |
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -1,87 +0,0 @@ |
||||
# crc32 |
||||
CRC32 hash with x64 optimizations |
||||
|
||||
This package is a drop-in replacement for the standard library `hash/crc32` package. It features SSE 4.2 optimizations on x64 platforms, giving roughly a 10x speedup. |
||||
|
||||
[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32) |
||||
|
||||
# usage |
||||
|
||||
Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. |
||||
|
||||
Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. |
||||
|
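A minimal sketch of the drop-in use; `ChecksumIEEE`, `MakeTable`, `Castagnoli` and `Checksum` mirror the standard `hash/crc32` API and appear in this package's sources.

```Go
package main

import (
	"fmt"

	crc32 "github.com/klauspost/crc32"
)

func main() {
	data := []byte("The quick brown fox jumps over the lazy dog")

	// IEEE checksum, as with hash/crc32.
	fmt.Printf("IEEE:       0x%08x\n", crc32.ChecksumIEEE(data))

	// Castagnoli (CRC-32C) through a table, also identical to the standard API.
	table := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("Castagnoli: 0x%08x\n", crc32.Checksum(data, table))
}
```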
||||
# changes |
||||
* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match. |
||||
* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. |
||||
|
||||
|
||||
# performance |
||||
|
||||
For *Go 1.7* the performance is equivalent to the standard library, so if you are on Go 1.7 or newer you can switch back to `hash/crc32`. |
||||
|
||||
|
||||
For IEEE tables (the most common), there is approximately a factor-10 speedup with the "CLMUL" (carry-less multiplication) instruction: |
||||
``` |
||||
benchmark old ns/op new ns/op delta |
||||
BenchmarkCrc32KB 99955 10258 -89.74% |
||||
|
||||
benchmark old MB/s new MB/s speedup |
||||
BenchmarkCrc32KB 327.83 3194.20 9.74x |
||||
``` |
||||
|
||||
For other tables and "CLMUL" capable machines the performance is the same as the standard library. |
||||
|
||||
Here are some detailed benchmarks comparing against the Go 1.5 standard library with and without assembler enabled (see below for how to reproduce them). |
||||
|
||||
``` |
||||
Std: Standard Go 1.5 library |
||||
Crc: Indicates IEEE type CRC. |
||||
40B: Size of each slice encoded. |
||||
NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). |
||||
Castagnoli: Castagnoli CRC type. |
||||
|
||||
BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s |
||||
BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) |
||||
BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) |
||||
|
||||
BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s |
||||
BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) |
||||
BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) |
||||
|
||||
BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) |
||||
BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) |
||||
BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) |
||||
|
||||
BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) |
||||
BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) |
||||
BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) |
||||
|
||||
BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s |
||||
BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) |
||||
BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) |
||||
BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) |
||||
|
||||
BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s |
||||
BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) |
||||
BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) |
||||
BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) |
||||
|
||||
BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s |
||||
BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) |
||||
BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) |
||||
BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) |
||||
|
||||
BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s |
||||
BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) |
||||
BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) |
||||
BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) |
||||
``` |
||||
|
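To get comparable numbers on your own machine, running the package benchmarks is usually enough (a sketch; the exact benchmark names may differ between versions):

```
go test -bench=. github.com/klauspost/crc32
```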
||||
The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library. |
||||
|
||||
However, the improved use of slice-by-8 has not been submitted yet; it will probably be proposed for Go 1.7. |
||||
|
||||
# license |
||||
|
||||
Standard Go license. Changes are Copyright (c) 2015 Klaus Post under the same conditions. |
@ -1,207 +0,0 @@ |
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
|
||||
// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
|
||||
// information.
|
||||
//
|
||||
// Polynomials are represented in LSB-first form also known as reversed representation.
|
||||
//
|
||||
// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
|
||||
// for information.
|
||||
package crc32 |
||||
|
||||
import ( |
||||
"hash" |
||||
"sync" |
||||
) |
||||
|
||||
// The size of a CRC-32 checksum in bytes.
|
||||
const Size = 4 |
||||
|
||||
// Predefined polynomials.
|
||||
const ( |
||||
// IEEE is by far and away the most common CRC-32 polynomial.
|
||||
// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
|
||||
IEEE = 0xedb88320 |
||||
|
||||
// Castagnoli's polynomial, used in iSCSI.
|
||||
// Has better error detection characteristics than IEEE.
|
||||
// http://dx.doi.org/10.1109/26.231911
|
||||
Castagnoli = 0x82f63b78 |
||||
|
||||
// Koopman's polynomial.
|
||||
// Also has better error detection characteristics than IEEE.
|
||||
// http://dx.doi.org/10.1109/DSN.2002.1028931
|
||||
Koopman = 0xeb31d82e |
||||
) |
||||
|
||||
// Table is a 256-word table representing the polynomial for efficient processing.
|
||||
type Table [256]uint32 |
||||
|
||||
// This file makes use of functions implemented in architecture-specific files.
|
||||
// The interface that they implement is as follows:
|
||||
//
|
||||
// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
|
||||
// // algorithm is available.
|
||||
// archAvailableIEEE() bool
|
||||
//
|
||||
// // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
|
||||
// // It can only be called if archAvailableIEEE() returns true.
|
||||
// archInitIEEE()
|
||||
//
|
||||
// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
|
||||
// // archInitIEEE() was previously called.
|
||||
// archUpdateIEEE(crc uint32, p []byte) uint32
|
||||
//
|
||||
// // archAvailableCastagnoli reports whether an architecture-specific
|
||||
// // CRC32-C algorithm is available.
|
||||
// archAvailableCastagnoli() bool
|
||||
//
|
||||
// // archInitCastagnoli initializes the architecture-specific CRC32-C
|
||||
// // algorithm. It can only be called if archAvailableCastagnoli() returns
|
||||
// // true.
|
||||
// archInitCastagnoli()
|
||||
//
|
||||
// // archUpdateCastagnoli updates the given CRC32-C. It can only be called
|
||||
// // if archInitCastagnoli() was previously called.
|
||||
// archUpdateCastagnoli(crc uint32, p []byte) uint32
|
||||
|
||||
// castagnoliTable points to a lazily initialized Table for the Castagnoli
|
||||
// polynomial. MakeTable will always return this value when asked to make a
|
||||
// Castagnoli table so we can compare against it to find when the caller is
|
||||
// using this polynomial.
|
||||
var castagnoliTable *Table |
||||
var castagnoliTable8 *slicing8Table |
||||
var castagnoliArchImpl bool |
||||
var updateCastagnoli func(crc uint32, p []byte) uint32 |
||||
var castagnoliOnce sync.Once |
||||
|
||||
func castagnoliInit() { |
||||
castagnoliTable = simpleMakeTable(Castagnoli) |
||||
castagnoliArchImpl = archAvailableCastagnoli() |
||||
|
||||
if castagnoliArchImpl { |
||||
archInitCastagnoli() |
||||
updateCastagnoli = archUpdateCastagnoli |
||||
} else { |
||||
// Initialize the slicing-by-8 table.
|
||||
castagnoliTable8 = slicingMakeTable(Castagnoli) |
||||
updateCastagnoli = func(crc uint32, p []byte) uint32 { |
||||
return slicingUpdate(crc, castagnoliTable8, p) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// IEEETable is the table for the IEEE polynomial.
|
||||
var IEEETable = simpleMakeTable(IEEE) |
||||
|
||||
// ieeeTable8 is the slicing8Table for IEEE
|
||||
var ieeeTable8 *slicing8Table |
||||
var ieeeArchImpl bool |
||||
var updateIEEE func(crc uint32, p []byte) uint32 |
||||
var ieeeOnce sync.Once |
||||
|
||||
func ieeeInit() { |
||||
ieeeArchImpl = archAvailableIEEE() |
||||
|
||||
if ieeeArchImpl { |
||||
archInitIEEE() |
||||
updateIEEE = archUpdateIEEE |
||||
} else { |
||||
// Initialize the slicing-by-8 table.
|
||||
ieeeTable8 = slicingMakeTable(IEEE) |
||||
updateIEEE = func(crc uint32, p []byte) uint32 { |
||||
return slicingUpdate(crc, ieeeTable8, p) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// MakeTable returns a Table constructed from the specified polynomial.
|
||||
// The contents of this Table must not be modified.
|
||||
func MakeTable(poly uint32) *Table { |
||||
switch poly { |
||||
case IEEE: |
||||
ieeeOnce.Do(ieeeInit) |
||||
return IEEETable |
||||
case Castagnoli: |
||||
castagnoliOnce.Do(castagnoliInit) |
||||
return castagnoliTable |
||||
} |
||||
return simpleMakeTable(poly) |
||||
} |
||||
|
||||
// digest represents the partial evaluation of a checksum.
|
||||
type digest struct { |
||||
crc uint32 |
||||
tab *Table |
||||
} |
||||
|
||||
// New creates a new hash.Hash32 computing the CRC-32 checksum
|
||||
// using the polynomial represented by the Table.
|
||||
// Its Sum method will lay the value out in big-endian byte order.
|
||||
func New(tab *Table) hash.Hash32 { |
||||
if tab == IEEETable { |
||||
ieeeOnce.Do(ieeeInit) |
||||
} |
||||
return &digest{0, tab} |
||||
} |
||||
|
||||
// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
|
||||
// using the IEEE polynomial.
|
||||
// Its Sum method will lay the value out in big-endian byte order.
|
||||
func NewIEEE() hash.Hash32 { return New(IEEETable) } |
||||
|
||||
func (d *digest) Size() int { return Size } |
||||
|
||||
func (d *digest) BlockSize() int { return 1 } |
||||
|
||||
func (d *digest) Reset() { d.crc = 0 } |
||||
|
||||
// Update returns the result of adding the bytes in p to the crc.
|
||||
func Update(crc uint32, tab *Table, p []byte) uint32 { |
||||
switch tab { |
||||
case castagnoliTable: |
||||
return updateCastagnoli(crc, p) |
||||
case IEEETable: |
||||
// Unfortunately, because IEEETable is exported, IEEE may be used without a
|
||||
// call to MakeTable. We have to make sure it gets initialized in that case.
|
||||
ieeeOnce.Do(ieeeInit) |
||||
return updateIEEE(crc, p) |
||||
default: |
||||
return simpleUpdate(crc, tab, p) |
||||
} |
||||
} |
||||
|
||||
func (d *digest) Write(p []byte) (n int, err error) { |
||||
switch d.tab { |
||||
case castagnoliTable: |
||||
d.crc = updateCastagnoli(d.crc, p) |
||||
case IEEETable: |
||||
// We only create digest objects through New() which takes care of
|
||||
// initialization in this case.
|
||||
d.crc = updateIEEE(d.crc, p) |
||||
default: |
||||
d.crc = simpleUpdate(d.crc, d.tab, p) |
||||
} |
||||
return len(p), nil |
||||
} |
||||
|
||||
func (d *digest) Sum32() uint32 { return d.crc } |
||||
|
||||
func (d *digest) Sum(in []byte) []byte { |
||||
s := d.Sum32() |
||||
return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) |
||||
} |
||||
|
||||
// Checksum returns the CRC-32 checksum of data
|
||||
// using the polynomial represented by the Table.
|
||||
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } |
||||
|
||||
// ChecksumIEEE returns the CRC-32 checksum of data
|
||||
// using the IEEE polynomial.
|
||||
func ChecksumIEEE(data []byte) uint32 { |
||||
ieeeOnce.Do(ieeeInit) |
||||
return updateIEEE(0, data) |
||||
} |
@ -1,230 +0,0 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !appengine,!gccgo
|
||||
|
||||
// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
|
||||
// description of the interface that each architecture-specific file
|
||||
// implements.
|
||||
|
||||
package crc32 |
||||
|
||||
import "unsafe" |
||||
|
||||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
||||
// and IEEE CRC.
|
||||
|
||||
// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
|
||||
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
|
||||
func haveSSE41() bool |
||||
func haveSSE42() bool |
||||
func haveCLMUL() bool |
||||
|
||||
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
|
||||
// instruction.
|
||||
//go:noescape
|
||||
func castagnoliSSE42(crc uint32, p []byte) uint32 |
||||
|
||||
// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
|
||||
// instruction.
|
||||
//go:noescape
|
||||
func castagnoliSSE42Triple( |
||||
crcA, crcB, crcC uint32, |
||||
a, b, c []byte, |
||||
rounds uint32, |
||||
) (retA uint32, retB uint32, retC uint32) |
||||
|
||||
// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
|
||||
// instruction as well as SSE 4.1.
|
||||
//go:noescape
|
||||
func ieeeCLMUL(crc uint32, p []byte) uint32 |
||||
|
||||
var sse42 = haveSSE42() |
||||
var useFastIEEE = haveCLMUL() && haveSSE41() |
||||
|
||||
const castagnoliK1 = 168 |
||||
const castagnoliK2 = 1344 |
||||
|
||||
type sse42Table [4]Table |
||||
|
||||
var castagnoliSSE42TableK1 *sse42Table |
||||
var castagnoliSSE42TableK2 *sse42Table |
||||
|
||||
func archAvailableCastagnoli() bool { |
||||
return sse42 |
||||
} |
||||
|
||||
func archInitCastagnoli() { |
||||
if !sse42 { |
||||
panic("arch-specific Castagnoli not available") |
||||
} |
||||
castagnoliSSE42TableK1 = new(sse42Table) |
||||
castagnoliSSE42TableK2 = new(sse42Table) |
||||
// See description in updateCastagnoli.
|
||||
// t[0][i] = CRC(i000, O)
|
||||
// t[1][i] = CRC(0i00, O)
|
||||
// t[2][i] = CRC(00i0, O)
|
||||
// t[3][i] = CRC(000i, O)
|
||||
// where O is a sequence of K zeros.
|
||||
var tmp [castagnoliK2]byte |
||||
for b := 0; b < 4; b++ { |
||||
for i := 0; i < 256; i++ { |
||||
val := uint32(i) << uint32(b*8) |
||||
castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1]) |
||||
castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:]) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
|
||||
// table given) with the given initial crc value. This corresponds to
|
||||
// CRC(crc, O) in the description in updateCastagnoli.
|
||||
func castagnoliShift(table *sse42Table, crc uint32) uint32 { |
||||
return table[3][crc>>24] ^ |
||||
table[2][(crc>>16)&0xFF] ^ |
||||
table[1][(crc>>8)&0xFF] ^ |
||||
table[0][crc&0xFF] |
||||
} |
||||
|
||||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { |
||||
if !sse42 { |
||||
panic("not available") |
||||
} |
||||
|
||||
// This method is inspired by the algorithm in Intel's white paper:
|
||||
// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
|
||||
// The same strategy of splitting the buffer in three is used but the
|
||||
// combining calculation is different; the complete derivation is explained
|
||||
// below.
|
||||
//
|
||||
// -- The basic idea --
|
||||
//
|
||||
// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
|
||||
// time. In recent Intel architectures the instruction takes 3 cycles;
|
||||
// however the processor can pipeline up to three instructions if they
|
||||
// don't depend on each other.
|
||||
//
|
||||
// Roughly this means that we can process three buffers in about the same
|
||||
// time we can process one buffer.
|
||||
//
|
||||
// The idea is then to split the buffer in three, CRC the three pieces
|
||||
// separately and then combine the results.
|
||||
//
|
||||
// Combining the results requires precomputed tables, so we must choose a
|
||||
// fixed buffer length to optimize. The longer the length, the faster; but
|
||||
// only buffers longer than this length will use the optimization. We choose
|
||||
// two cutoffs and compute tables for both:
|
||||
// - one around 512: 168*3=504
|
||||
// - one around 4KB: 1344*3=4032
|
||||
//
|
||||
// -- The nitty gritty --
|
||||
//
|
||||
// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
|
||||
// initial non-inverted CRC I). This function has the following properties:
|
||||
// (a) CRC(I, AB) = CRC(CRC(I, A), B)
|
||||
// (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
|
||||
//
|
||||
// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
|
||||
// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
|
||||
// bytes.
|
||||
//
|
||||
// CRC(I, ABC) = CRC(I, ABO xor C)
|
||||
// = CRC(I, ABO) xor CRC(0, C)
|
||||
// = CRC(CRC(I, AB), O) xor CRC(0, C)
|
||||
// = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
|
||||
// = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
|
||||
// = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
|
||||
//
|
||||
// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
|
||||
// and CRC(0, C) efficiently. We just need to find a way to quickly compute
|
||||
// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
|
||||
// values; since we can't have a 32-bit table, we break it up into four
|
||||
// 8-bit tables:
|
||||
//
|
||||
// CRC(uvwx, O) = CRC(u000, O) xor
|
||||
// CRC(0v00, O) xor
|
||||
// CRC(00w0, O) xor
|
||||
// CRC(000x, O)
|
||||
//
|
||||
// We can compute tables corresponding to the four terms for all 8-bit
|
||||
// values.
|
||||
|
||||
crc = ^crc |
||||
|
||||
// If a buffer is long enough to use the optimization, process the first few
|
||||
// bytes to align the buffer to an 8 byte boundary (if necessary).
|
||||
if len(p) >= castagnoliK1*3 { |
||||
delta := int(uintptr(unsafe.Pointer(&p[0])) & 7) |
||||
if delta != 0 { |
||||
delta = 8 - delta |
||||
crc = castagnoliSSE42(crc, p[:delta]) |
||||
p = p[delta:] |
||||
} |
||||
} |
||||
|
||||
// Process 3*K2 at a time.
|
||||
for len(p) >= castagnoliK2*3 { |
||||
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
|
||||
crcA, crcB, crcC := castagnoliSSE42Triple( |
||||
crc, 0, 0, |
||||
p, p[castagnoliK2:], p[castagnoliK2*2:], |
||||
castagnoliK2/24) |
||||
|
||||
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
|
||||
crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB |
||||
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
|
||||
crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC |
||||
p = p[castagnoliK2*3:] |
||||
} |
||||
|
||||
// Process 3*K1 at a time.
|
||||
for len(p) >= castagnoliK1*3 { |
||||
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
|
||||
crcA, crcB, crcC := castagnoliSSE42Triple( |
||||
crc, 0, 0, |
||||
p, p[castagnoliK1:], p[castagnoliK1*2:], |
||||
castagnoliK1/24) |
||||
|
||||
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
|
||||
crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB |
||||
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
|
||||
crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC |
||||
p = p[castagnoliK1*3:] |
||||
} |
||||
|
||||
// Use the simple implementation for what's left.
|
||||
crc = castagnoliSSE42(crc, p) |
||||
return ^crc |
||||
} |
||||
|
||||
func archAvailableIEEE() bool { |
||||
return useFastIEEE |
||||
} |
||||
|
||||
var archIeeeTable8 *slicing8Table |
||||
|
||||
func archInitIEEE() { |
||||
if !useFastIEEE { |
||||
panic("not available") |
||||
} |
||||
// We still use slicing-by-8 for small buffers.
|
||||
archIeeeTable8 = slicingMakeTable(IEEE) |
||||
} |
||||
|
||||
func archUpdateIEEE(crc uint32, p []byte) uint32 { |
||||
if !useFastIEEE { |
||||
panic("not available") |
||||
} |
||||
|
||||
if len(p) >= 64 { |
||||
left := len(p) & 15 |
||||
do := len(p) - left |
||||
crc = ^ieeeCLMUL(^crc, p[:do]) |
||||
p = p[do:] |
||||
} |
||||
if len(p) == 0 { |
||||
return crc |
||||
} |
||||
return slicingUpdate(crc, archIeeeTable8, p) |
||||
} |
@ -1,319 +0,0 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved. |
||||
// Use of this source code is governed by a BSD-style |
||||
// license that can be found in the LICENSE file. |
||||
|
||||
// +build gc |
||||
|
||||
#define NOSPLIT 4 |
||||
#define RODATA 8 |
||||
|
||||
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer. |
||||
// |
||||
// func castagnoliSSE42(crc uint32, p []byte) uint32 |
||||
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 |
||||
MOVL crc+0(FP), AX // CRC value |
||||
MOVQ p+8(FP), SI // data pointer |
||||
MOVQ p_len+16(FP), CX // len(p) |
||||
|
||||
// If there are fewer than 8 bytes to process, skip alignment. |
||||
CMPQ CX, $8 |
||||
JL less_than_8 |
||||
|
||||
MOVQ SI, BX |
||||
ANDQ $7, BX |
||||
JZ aligned |
||||
|
||||
// Process the first few bytes to 8-byte align the input. |
||||
|
||||
// BX = 8 - BX. We need to process this many bytes to align. |
||||
SUBQ $1, BX |
||||
XORQ $7, BX |
||||
|
||||
BTQ $0, BX |
||||
JNC align_2 |
||||
|
||||
CRC32B (SI), AX |
||||
DECQ CX |
||||
INCQ SI |
||||
|
||||
align_2: |
||||
BTQ $1, BX |
||||
JNC align_4 |
||||
|
||||
// CRC32W (SI), AX |
||||
BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||||
|
||||
SUBQ $2, CX |
||||
ADDQ $2, SI |
||||
|
||||
align_4: |
||||
BTQ $2, BX |
||||
JNC aligned |
||||
|
||||
// CRC32L (SI), AX |
||||
BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||||
|
||||
SUBQ $4, CX |
||||
ADDQ $4, SI |
||||
|
||||
aligned: |
||||
// The input is now 8-byte aligned and we can process 8-byte chunks. |
||||
CMPQ CX, $8 |
||||
JL less_than_8 |
||||
|
||||
CRC32Q (SI), AX |
||||
ADDQ $8, SI |
||||
SUBQ $8, CX |
||||
JMP aligned |
||||
|
||||
less_than_8: |
||||
// We may have some bytes left over; process 4 bytes, then 2, then 1.
|
||||
BTQ $2, CX |
||||
JNC less_than_4 |
||||
|
||||
// CRC32L (SI), AX |
||||
BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||||
ADDQ $4, SI |
||||
|
||||
less_than_4: |
||||
BTQ $1, CX |
||||
JNC less_than_2 |
||||
|
||||
// CRC32W (SI), AX |
||||
BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
|
||||
ADDQ $2, SI |
||||
|
||||
less_than_2: |
||||
BTQ $0, CX |
||||
JNC done |
||||
|
||||
CRC32B (SI), AX |
||||
|
||||
done: |
||||
MOVL AX, ret+32(FP) |
||||
RET |
||||
|
||||
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds) |
||||
// bytes from each buffer. |
||||
// |
||||
// func castagnoliSSE42Triple( |
||||
// crc1, crc2, crc3 uint32, |
||||
// a, b, c []byte, |
||||
// rounds uint32, |
||||
// ) (retA uint32, retB uint32, retC uint32) |
||||
TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0 |
||||
MOVL crcA+0(FP), AX |
||||
MOVL crcB+4(FP), CX |
||||
MOVL crcC+8(FP), DX |
||||
|
||||
MOVQ a+16(FP), R8 // data pointer |
||||
MOVQ b+40(FP), R9 // data pointer |
||||
MOVQ c+64(FP), R10 // data pointer |
||||
|
||||
MOVL rounds+88(FP), R11 |
||||
|
||||
loop: |
||||
CRC32Q (R8), AX |
||||
CRC32Q (R9), CX |
||||
CRC32Q (R10), DX |
||||
|
||||
CRC32Q 8(R8), AX |
||||
CRC32Q 8(R9), CX |
||||
CRC32Q 8(R10), DX |
||||
|
||||
CRC32Q 16(R8), AX |
||||
CRC32Q 16(R9), CX |
||||
CRC32Q 16(R10), DX |
||||
|
||||
ADDQ $24, R8 |
||||
ADDQ $24, R9 |
||||
ADDQ $24, R10 |
||||
|
||||
DECQ R11 |
||||
JNZ loop |
||||
|
||||
MOVL AX, retA+96(FP) |
||||
MOVL CX, retB+100(FP) |
||||
MOVL DX, retC+104(FP) |
||||
RET |
||||
|
||||
// func haveSSE42() bool |
||||
TEXT ·haveSSE42(SB), NOSPLIT, $0 |
||||
XORQ AX, AX |
||||
INCL AX |
||||
CPUID |
||||
SHRQ $20, CX |
||||
ANDQ $1, CX |
||||
MOVB CX, ret+0(FP) |
||||
RET |
||||
|
||||
// func haveCLMUL() bool |
||||
TEXT ·haveCLMUL(SB), NOSPLIT, $0 |
||||
XORQ AX, AX |
||||
INCL AX |
||||
CPUID |
||||
SHRQ $1, CX |
||||
ANDQ $1, CX |
||||
MOVB CX, ret+0(FP) |
||||
RET |
||||
|
||||
// func haveSSE41() bool |
||||
TEXT ·haveSSE41(SB), NOSPLIT, $0 |
||||
XORQ AX, AX |
||||
INCL AX |
||||
CPUID |
||||
SHRQ $19, CX |
||||
ANDQ $1, CX |
||||
MOVB CX, ret+0(FP) |
||||
RET |
||||
|
||||
// CRC32 polynomial data |
||||
// |
||||
// These constants are lifted from the |
||||
// Linux kernel, since they avoid the costly |
||||
// PSHUFB 16 byte reversal proposed in the |
||||
// original Intel paper. |
||||
DATA r2r1kp<>+0(SB)/8, $0x154442bd4 |
||||
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 |
||||
DATA r4r3kp<>+0(SB)/8, $0x1751997d0 |
||||
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e |
||||
DATA rupolykp<>+0(SB)/8, $0x1db710641 |
||||
DATA rupolykp<>+8(SB)/8, $0x1f7011641 |
||||
DATA r5kp<>+0(SB)/8, $0x163cd6124 |
||||
|
||||
GLOBL r2r1kp<>(SB), RODATA, $16 |
||||
GLOBL r4r3kp<>(SB), RODATA, $16 |
||||
GLOBL rupolykp<>(SB), RODATA, $16 |
||||
GLOBL r5kp<>(SB), RODATA, $8 |
||||
|
||||
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf |
||||
// len(p) must be at least 64, and must be a multiple of 16. |
||||
|
||||
// func ieeeCLMUL(crc uint32, p []byte) uint32 |
||||
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0 |
||||
MOVL crc+0(FP), X0 // Initial CRC value |
||||
MOVQ p+8(FP), SI // data pointer |
||||
MOVQ p_len+16(FP), CX // len(p) |
||||
|
||||
MOVOU (SI), X1 |
||||
MOVOU 16(SI), X2 |
||||
MOVOU 32(SI), X3 |
||||
MOVOU 48(SI), X4 |
||||
PXOR X0, X1 |
||||
ADDQ $64, SI // buf+=64 |
||||
SUBQ $64, CX // len-=64 |
||||
CMPQ CX, $64 // Less than 64 bytes left |
||||
JB remain64 |
||||
|
||||
MOVOA r2r1kp<>+0(SB), X0 |
||||
|
||||
loopback64: |
||||
MOVOA X1, X5 |
||||
MOVOA X2, X6 |
||||
MOVOA X3, X7 |
||||
MOVOA X4, X8 |
||||
|
||||
PCLMULQDQ $0, X0, X1 |
||||
PCLMULQDQ $0, X0, X2 |
||||
PCLMULQDQ $0, X0, X3 |
||||
PCLMULQDQ $0, X0, X4 |
||||
|
||||
// Load next early |
||||
MOVOU (SI), X11 |
||||
MOVOU 16(SI), X12 |
||||
MOVOU 32(SI), X13 |
||||
MOVOU 48(SI), X14 |
||||
|
||||
PCLMULQDQ $0x11, X0, X5 |
||||
PCLMULQDQ $0x11, X0, X6 |
||||
PCLMULQDQ $0x11, X0, X7 |
||||
PCLMULQDQ $0x11, X0, X8 |
||||
|
||||
PXOR X5, X1 |
||||
PXOR X6, X2 |
||||
PXOR X7, X3 |
||||
PXOR X8, X4 |
||||
|
||||
PXOR X11, X1 |
||||
PXOR X12, X2 |
||||
PXOR X13, X3 |
||||
PXOR X14, X4 |
||||
|
||||
ADDQ $0x40, DI |
||||
ADDQ $64, SI // buf+=64 |
||||
SUBQ $64, CX // len-=64 |
||||
CMPQ CX, $64 // Less than 64 bytes left? |
||||
JGE loopback64 |
||||
|
||||
// Fold result into a single register (X1) |
||||
remain64: |
||||
MOVOA r4r3kp<>+0(SB), X0 |
||||
|
||||
MOVOA X1, X5 |
||||
PCLMULQDQ $0, X0, X1 |
||||
PCLMULQDQ $0x11, X0, X5 |
||||
PXOR X5, X1 |
||||
PXOR X2, X1 |
||||
|
||||
MOVOA X1, X5 |
||||
PCLMULQDQ $0, X0, X1 |
||||
PCLMULQDQ $0x11, X0, X5 |
||||
PXOR X5, X1 |
||||
PXOR X3, X1 |
||||
|
||||
MOVOA X1, X5 |
||||
PCLMULQDQ $0, X0, X1 |
||||
PCLMULQDQ $0x11, X0, X5 |
||||
PXOR X5, X1 |
||||
PXOR X4, X1 |
||||
|
||||
// If there is less than 16 bytes left we are done |
||||
CMPQ CX, $16 |
||||
JB finish |
||||
|
||||
// Encode 16 bytes |
||||
remain16: |
||||
MOVOU (SI), X10 |
||||
MOVOA X1, X5 |
||||
PCLMULQDQ $0, X0, X1 |
||||
PCLMULQDQ $0x11, X0, X5 |
||||
PXOR X5, X1 |
||||
PXOR X10, X1 |
||||
SUBQ $16, CX |
||||
ADDQ $16, SI |
||||
CMPQ CX, $16 |
||||
JGE remain16 |
||||
|
||||
finish: |
||||
// Fold final result into 32 bits and return it |
||||
PCMPEQB X3, X3 |
||||
PCLMULQDQ $1, X1, X0 |
||||
PSRLDQ $8, X1 |
||||
PXOR X0, X1 |
||||
|
||||
MOVOA X1, X2 |
||||
MOVQ r5kp<>+0(SB), X0 |
||||
|
||||
// Creates 32 bit mask. Note that we don't care about upper half. |
||||
PSRLQ $32, X3 |
||||
|
||||
PSRLDQ $4, X2 |
||||
PAND X3, X1 |
||||
PCLMULQDQ $0, X0, X1 |
||||
PXOR X2, X1 |
||||
|
||||
MOVOA rupolykp<>+0(SB), X0 |
||||
|
||||
MOVOA X1, X2 |
||||
PAND X3, X1 |
||||
PCLMULQDQ $0x10, X0, X1 |
||||
PAND X3, X1 |
||||
PCLMULQDQ $0, X0, X1 |
||||
PXOR X2, X1 |
||||
|
||||
// PEXTRD $1, X1, AX (SSE 4.1) |
||||
BYTE $0x66; BYTE $0x0f; BYTE $0x3a
|
||||
BYTE $0x16; BYTE $0xc8; BYTE $0x01
|
||||
MOVL AX, ret+32(FP) |
||||
|
||||
RET |
@ -1,43 +0,0 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !appengine,!gccgo
|
||||
|
||||
package crc32 |
||||
|
||||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
||||
// CRC.
|
||||
|
||||
// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
|
||||
// support.
|
||||
func haveSSE42() bool |
||||
|
||||
// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
|
||||
// instruction.
|
||||
//go:noescape
|
||||
func castagnoliSSE42(crc uint32, p []byte) uint32 |
||||
|
||||
var sse42 = haveSSE42() |
||||
|
||||
func archAvailableCastagnoli() bool { |
||||
return sse42 |
||||
} |
||||
|
||||
func archInitCastagnoli() { |
||||
if !sse42 { |
||||
panic("not available") |
||||
} |
||||
// No initialization necessary.
|
||||
} |
||||
|
||||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { |
||||
if !sse42 { |
||||
panic("not available") |
||||
} |
||||
return castagnoliSSE42(crc, p) |
||||
} |
||||
|
||||
func archAvailableIEEE() bool { return false } |
||||
func archInitIEEE() { panic("not available") } |
||||
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } |
@ -1,67 +0,0 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved. |
||||
// Use of this source code is governed by a BSD-style |
||||
// license that can be found in the LICENSE file. |
||||
|
||||
// +build gc |
||||
|
||||
#define NOSPLIT 4 |
||||
#define RODATA 8 |
||||
|
||||
// func castagnoliSSE42(crc uint32, p []byte) uint32 |
||||
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 |
||||
MOVL crc+0(FP), AX // CRC value |
||||
MOVL p+4(FP), SI // data pointer |
||||
MOVL p_len+8(FP), CX // len(p) |
||||
|
||||
NOTL AX |
||||
|
||||
// If there's less than 8 bytes to process, we do it byte-by-byte. |
||||
CMPQ CX, $8 |
||||
JL cleanup |
||||
|
||||
// Process individual bytes until the input is 8-byte aligned. |
||||
startup: |
||||
MOVQ SI, BX |
||||
ANDQ $7, BX |
||||
JZ aligned |
||||
|
||||
CRC32B (SI), AX |
||||
DECQ CX |
||||
INCQ SI |
||||
JMP startup |
||||
|
||||
aligned: |
||||
// The input is now 8-byte aligned and we can process 8-byte chunks. |
||||
CMPQ CX, $8 |
||||
JL cleanup |
||||
|
||||
CRC32Q (SI), AX |
||||
ADDQ $8, SI |
||||
SUBQ $8, CX |
||||
JMP aligned |
||||
|
||||
cleanup: |
||||
// We may have some bytes left over that we process one at a time. |
||||
CMPQ CX, $0 |
||||
JE done |
||||
|
||||
CRC32B (SI), AX |
||||
INCQ SI |
||||
DECQ CX |
||||
JMP cleanup |
||||
|
||||
done: |
||||
NOTL AX |
||||
MOVL AX, ret+16(FP) |
||||
RET |
||||
|
||||
// func haveSSE42() bool |
||||
TEXT ·haveSSE42(SB), NOSPLIT, $0 |
||||
XORQ AX, AX |
||||
INCL AX |
||||
CPUID |
||||
SHRQ $20, CX |
||||
ANDQ $1, CX |
||||
MOVB CX, ret+0(FP) |
||||
RET |
||||
|
@ -1,89 +0,0 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// This file contains CRC32 algorithms that are not specific to any architecture
|
||||
// and don't use hardware acceleration.
|
||||
//
|
||||
// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
|
||||
//
|
||||
// The slicing-by-8 algorithm is a faster implementation that uses a bigger
|
||||
// table (8*256*4 bytes).
|
||||
|
||||
package crc32 |
||||
|
||||
// simpleMakeTable allocates and constructs a Table for the specified
|
||||
// polynomial. The table is suitable for use with the simple algorithm
|
||||
// (simpleUpdate).
|
||||
func simpleMakeTable(poly uint32) *Table { |
||||
t := new(Table) |
||||
simplePopulateTable(poly, t) |
||||
return t |
||||
} |
||||
|
||||
// simplePopulateTable constructs a Table for the specified polynomial, suitable
|
||||
// for use with simpleUpdate.
|
||||
func simplePopulateTable(poly uint32, t *Table) { |
||||
for i := 0; i < 256; i++ { |
||||
crc := uint32(i) |
||||
for j := 0; j < 8; j++ { |
||||
if crc&1 == 1 { |
||||
crc = (crc >> 1) ^ poly |
||||
} else { |
||||
crc >>= 1 |
||||
} |
||||
} |
||||
t[i] = crc |
||||
} |
||||
} |
||||
|
||||
// simpleUpdate uses the simple algorithm to update the CRC, given a table that
|
||||
// was previously computed using simpleMakeTable.
|
||||
func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 { |
||||
crc = ^crc |
||||
for _, v := range p { |
||||
crc = tab[byte(crc)^v] ^ (crc >> 8) |
||||
} |
||||
return ^crc |
||||
} |
||||
|
||||
// Use slicing-by-8 when payload >= this value.
|
||||
const slicing8Cutoff = 16 |
||||
|
||||
// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm.
|
||||
type slicing8Table [8]Table |
||||
|
||||
// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
|
||||
// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
|
||||
func slicingMakeTable(poly uint32) *slicing8Table { |
||||
t := new(slicing8Table) |
||||
simplePopulateTable(poly, &t[0]) |
||||
for i := 0; i < 256; i++ { |
||||
crc := t[0][i] |
||||
for j := 1; j < 8; j++ { |
||||
crc = t[0][crc&0xFF] ^ (crc >> 8) |
||||
t[j][i] = crc |
||||
} |
||||
} |
||||
return t |
||||
} |
||||
|
||||
// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
|
||||
// table that was previously computed using slicingMakeTable.
|
||||
func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 { |
||||
if len(p) >= slicing8Cutoff { |
||||
crc = ^crc |
||||
for len(p) > 8 { |
||||
crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 |
||||
crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ |
||||
tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ |
||||
tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] |
||||
p = p[8:] |
||||
} |
||||
crc = ^crc |
||||
} |
||||
if len(p) == 0 { |
||||
return crc |
||||
} |
||||
return simpleUpdate(crc, &tab[0], p) |
||||
} |
@ -1,15 +0,0 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!amd64p32,!s390x
|
||||
|
||||
package crc32 |
||||
|
||||
func archAvailableIEEE() bool { return false } |
||||
func archInitIEEE() { panic("not available") } |
||||
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } |
||||
|
||||
func archAvailableCastagnoli() bool { return false } |
||||
func archInitCastagnoli() { panic("not available") } |
||||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") } |
@ -1,91 +0,0 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build s390x
|
||||
|
||||
package crc32 |
||||
|
||||
const ( |
||||
vxMinLen = 64 |
||||
vxAlignMask = 15 // align to 16 bytes
|
||||
) |
||||
|
||||
// hasVectorFacility reports whether the machine has the z/Architecture
|
||||
// vector facility installed and enabled.
|
||||
func hasVectorFacility() bool |
||||
|
||||
var hasVX = hasVectorFacility() |
||||
|
||||
// vectorizedCastagnoli implements CRC32 using vector instructions.
|
||||
// It is defined in crc32_s390x.s.
|
||||
//go:noescape
|
||||
func vectorizedCastagnoli(crc uint32, p []byte) uint32 |
||||
|
||||
// vectorizedIEEE implements CRC32 using vector instructions.
|
||||
// It is defined in crc32_s390x.s.
|
||||
//go:noescape
|
||||
func vectorizedIEEE(crc uint32, p []byte) uint32 |
||||
|
||||
func archAvailableCastagnoli() bool { |
||||
return hasVX |
||||
} |
||||
|
||||
var archCastagnoliTable8 *slicing8Table |
||||
|
||||
func archInitCastagnoli() { |
||||
if !hasVX { |
||||
panic("not available") |
||||
} |
||||
// We still use slicing-by-8 for small buffers.
|
||||
archCastagnoliTable8 = slicingMakeTable(Castagnoli) |
||||
} |
||||
|
||||
// archUpdateCastagnoli calculates the checksum of p using
|
||||
// vectorizedCastagnoli.
|
||||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { |
||||
if !hasVX { |
||||
panic("not available") |
||||
} |
||||
// Use vectorized function if data length is above threshold.
|
||||
if len(p) >= vxMinLen { |
||||
aligned := len(p) & ^vxAlignMask |
||||
crc = vectorizedCastagnoli(crc, p[:aligned]) |
||||
p = p[aligned:] |
||||
} |
||||
if len(p) == 0 { |
||||
return crc |
||||
} |
||||
return slicingUpdate(crc, archCastagnoliTable8, p) |
||||
} |
||||
|
||||
func archAvailableIEEE() bool { |
||||
return hasVX |
||||
} |
||||
|
||||
var archIeeeTable8 *slicing8Table |
||||
|
||||
func archInitIEEE() { |
||||
if !hasVX { |
||||
panic("not available") |
||||
} |
||||
// We still use slicing-by-8 for small buffers.
|
||||
archIeeeTable8 = slicingMakeTable(IEEE) |
||||
} |
||||
|
||||
// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
|
||||
func archUpdateIEEE(crc uint32, p []byte) uint32 { |
||||
if !hasVX { |
||||
panic("not available") |
||||
} |
||||
// Use vectorized function if data length is above threshold.
|
||||
if len(p) >= vxMinLen { |
||||
aligned := len(p) & ^vxAlignMask |
||||
crc = vectorizedIEEE(crc, p[:aligned]) |
||||
p = p[aligned:] |
||||
} |
||||
if len(p) == 0 { |
||||
return crc |
||||
} |
||||
return slicingUpdate(crc, archIeeeTable8, p) |
||||
} |
@ -1,249 +0,0 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved. |
||||
// Use of this source code is governed by a BSD-style |
||||
// license that can be found in the LICENSE file. |
||||
|
||||
// +build s390x |
||||
|
||||
#include "textflag.h" |
||||
|
||||
// Vector register range containing CRC-32 constants |
||||
|
||||
#define CONST_PERM_LE2BE V9 |
||||
#define CONST_R2R1 V10 |
||||
#define CONST_R4R3 V11 |
||||
#define CONST_R5 V12 |
||||
#define CONST_RU_POLY V13 |
||||
#define CONST_CRC_POLY V14 |
||||
|
||||
// The CRC-32 constant block contains reduction constants to fold and |
||||
// process particular chunks of the input data stream in parallel. |
||||
// |
||||
// Note that the constant definitions below are extended in order to compute |
||||
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. |
||||
// The rightmost doubleword can be 0 to prevent contribution to the result or |
||||
// can be multiplied by 1 to perform an XOR without the need for a separate |
||||
// VECTOR EXCLUSIVE OR instruction. |
||||
// |
||||
// The polynomials used are bit-reflected: |
||||
// |
||||
// IEEE: P'(x) = 0x0edb88320 |
||||
// Castagnoli: P'(x) = 0x082f63b78 |
||||
|
||||
// IEEE polynomial constants |
||||
DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask |
||||
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 |
||||
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2 |
||||
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1 |
||||
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4 |
||||
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3 |
||||
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 |
||||
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5 |
||||
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 |
||||
DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u' |
||||
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 |
||||
DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1 |
||||
|
||||
GLOBL ·crcleconskp(SB), RODATA, $144 |
||||
|
||||
// Castagnoli polynomial constants |
||||
DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask |
||||
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 |
||||
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2 |
||||
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1 |
||||
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4 |
||||
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3 |
||||
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 |
||||
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5 |
||||
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 |
||||
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u' |
||||
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 |
||||
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1 |
||||
|
||||
GLOBL ·crccleconskp(SB), RODATA, $144 |
||||
|
||||
// func hasVectorFacility() bool |
||||
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 |
||||
MOVD $x-24(SP), R1 |
||||
XC $24, 0(R1), 0(R1) // clear the storage |
||||
MOVD $2, R0 // R0 is the number of double words stored -1 |
||||
WORD $0xB2B01000 // STFLE 0(R1) |
||||
XOR R0, R0 // reset the value of R0 |
||||
MOVBZ z-8(SP), R1 |
||||
AND $0x40, R1 |
||||
BEQ novector |
||||
|
||||
vectorinstalled: |
||||
// check if the vector instruction has been enabled |
||||
VLEIB $0, $0xF, V16 |
||||
VLGVB $0, V16, R1 |
||||
CMPBNE R1, $0xF, novector |
||||
MOVB $1, ret+0(FP) // have vx |
||||
RET |
||||
|
||||
novector: |
||||
MOVB $0, ret+0(FP) // no vx |
||||
RET |
||||
|
||||
// The CRC-32 function(s) use these calling conventions: |
||||
// |
||||
// Parameters: |
||||
// |
||||
// R2: Initial CRC value, typically ~0; and final CRC (return) value.
|
||||
// R3: Input buffer pointer, performance might be improved if the |
||||
// buffer is on a doubleword boundary. |
||||
// R4: Length of the buffer, must be 64 bytes or greater. |
||||
// |
||||
// Register usage: |
||||
// |
||||
// R5: CRC-32 constant pool base pointer. |
||||
// V0: Initial CRC value and intermediate constants and results. |
||||
// V1..V4: Data for CRC computation. |
||||
// V5..V8: Next data chunks that are fetched from the input buffer. |
||||
// |
||||
// V9..V14: CRC-32 constants. |
||||
|
||||
// func vectorizedIEEE(crc uint32, p []byte) uint32 |
||||
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0 |
||||
MOVWZ crc+0(FP), R2 // R2 stores the CRC value |
||||
MOVD p+8(FP), R3 // data pointer |
||||
MOVD p_len+16(FP), R4 // len(p) |
||||
|
||||
MOVD $·crcleconskp(SB), R5 |
||||
BR vectorizedBody<>(SB) |
||||
|
||||
// func vectorizedCastagnoli(crc uint32, p []byte) uint32 |
||||
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0 |
||||
MOVWZ crc+0(FP), R2 // R2 stores the CRC value |
||||
MOVD p+8(FP), R3 // data pointer |
||||
MOVD p_len+16(FP), R4 // len(p) |
||||
|
||||
// R5: crc-32 constant pool base pointer, constant is used to reduce crc |
||||
MOVD $·crccleconskp(SB), R5 |
||||
BR vectorizedBody<>(SB) |
||||
|
||||
TEXT vectorizedBody<>(SB), NOSPLIT, $0 |
||||
XOR $0xffffffff, R2 // NOTW R2 |
||||
VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY |
||||
|
||||
// Load the initial CRC value into the rightmost word of V0 |
||||
VZERO V0 |
||||
VLVGF $3, R2, V0 |
||||
|
||||
// Crash if the input size is less than 64-bytes. |
||||
CMP R4, $64 |
||||
BLT crash |
||||
|
||||
// Load a 64-byte data chunk and XOR with CRC |
||||
VLM 0(R3), V1, V4 // 64-bytes into V1..V4 |
||||
|
||||
// Reflect the data if the CRC operation is in the bit-reflected domain |
||||
VPERM V1, V1, CONST_PERM_LE2BE, V1 |
||||
VPERM V2, V2, CONST_PERM_LE2BE, V2 |
||||
VPERM V3, V3, CONST_PERM_LE2BE, V3 |
||||
VPERM V4, V4, CONST_PERM_LE2BE, V4 |
||||
|
||||
VX V0, V1, V1 // V1 ^= CRC |
||||
ADD $64, R3 // BUF = BUF + 64 |
||||
ADD $(-64), R4 |
||||
|
||||
// Check remaining buffer size and jump to proper folding method |
||||
CMP R4, $64 |
||||
BLT less_than_64bytes |
||||
|
||||
fold_64bytes_loop: |
||||
// Load the next 64-byte data chunk into V5 to V8 |
||||
VLM 0(R3), V5, V8 |
||||
VPERM V5, V5, CONST_PERM_LE2BE, V5 |
||||
VPERM V6, V6, CONST_PERM_LE2BE, V6 |
||||
VPERM V7, V7, CONST_PERM_LE2BE, V7 |
||||
VPERM V8, V8, CONST_PERM_LE2BE, V8 |
||||
|
||||
// Perform a GF(2) multiplication of the doublewords in V1 with |
||||
// the reduction constants in V0. The intermediate result is |
||||
// then folded (accumulated) with the next data chunk in V5 and |
||||
// stored in V1. Repeat this step for the register contents |
||||
// in V2, V3, and V4 respectively. |
||||
|
||||
VGFMAG CONST_R2R1, V1, V5, V1 |
||||
VGFMAG CONST_R2R1, V2, V6, V2 |
||||
VGFMAG CONST_R2R1, V3, V7, V3 |
||||
VGFMAG CONST_R2R1, V4, V8, V4 |
||||
|
||||
// Adjust buffer pointer and length for next loop |
||||
ADD $64, R3 // BUF = BUF + 64 |
||||
ADD $(-64), R4 // LEN = LEN - 64 |
||||
|
||||
CMP R4, $64 |
||||
BGE fold_64bytes_loop |
||||
|
||||
less_than_64bytes: |
||||
// Fold V1 to V4 into a single 128-bit value in V1 |
||||
VGFMAG CONST_R4R3, V1, V2, V1 |
||||
VGFMAG CONST_R4R3, V1, V3, V1 |
||||
VGFMAG CONST_R4R3, V1, V4, V1 |
||||
|
||||
// Check whether to continue with 64-bit folding |
||||
CMP R4, $16 |
||||
BLT final_fold |
||||
|
||||
fold_16bytes_loop: |
||||
VL 0(R3), V2 // Load next data chunk |
||||
VPERM V2, V2, CONST_PERM_LE2BE, V2 |
||||
|
||||
VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk |
||||
|
||||
// Adjust buffer pointer and size for folding next data chunk |
||||
ADD $16, R3 |
||||
ADD $-16, R4 |
||||
|
||||
// Process remaining data chunks |
||||
CMP R4, $16 |
||||
BGE fold_16bytes_loop |
||||
|
||||
final_fold: |
||||
VLEIB $7, $0x40, V9 |
||||
VSRLB V9, CONST_R4R3, V0 |
||||
VLEIG $0, $1, V0 |
||||
|
||||
VGFMG V0, V1, V1 |
||||
|
||||
VLEIB $7, $0x20, V9 // Shift by words |
||||
VSRLB V9, V1, V2 // Store remaining bits in V2 |
||||
VUPLLF V1, V1 // Split rightmost doubleword |
||||
VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2 |
||||
|
||||
// The input values to the Barrett reduction are the degree-63 polynomial |
||||
// in V1 (R(x)), degree-32 generator polynomial, and the reduction |
||||
// constant u. The Barret reduction result is the CRC value of R(x) mod |
||||
// P(x). |
||||
// |
||||
// The Barret reduction algorithm is defined as: |
||||
// |
||||
// 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u |
||||
// 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) |
||||
// 3. C(x) = R(x) XOR T2(x) mod x^32 |
||||
// |
||||
// Note: To compensate the division by x^32, use the vector unpack |
||||
// instruction to move the leftmost word into the leftmost doubleword |
||||
// of the vector register. The rightmost doubleword is multiplied |
||||
// with zero to not contribute to the intermediate results.
||||
|
||||
// T1(x) = floor( R(x) / x^32 ) GF2MUL u |
||||
VUPLLF V1, V2 |
||||
VGFMG CONST_RU_POLY, V2, V2 |
||||
|
||||
// Compute the GF(2) product of the CRC polynomial in VO with T1(x) in |
||||
// V2 and XOR the intermediate result, T2(x), with the value in V1. |
||||
// The final result is in the rightmost word of V2. |
||||
|
||||
VUPLLF V2, V2 |
||||
VGFMAG CONST_CRC_POLY, V2, V1, V2 |
||||
|
||||
done: |
||||
VLGVF $2, V2, R2 |
||||
XOR $0xffffffff, R2 // NOTW R2 |
||||
MOVWZ R2, ret + 32(FP) |
||||
RET |
||||
|
||||
crash: |
||||
MOVD $0, (R0) // input size is less than 64-bytes |