Add additional password hash algorithms (closes #5859) (#6023)

6 years ago · 8d9d6aa903
parent 1b85b248e4
commit 8d9d6aa903
23 changed files with 2898 additions and 23 deletions
--- a/custom/conf/app.ini.sample
+++ b/custom/conf/app.ini.sample
@ -319,6 +319,8 @@ MIN_PASSWORD_LENGTH = 6
 IMPORT_LOCAL_PATHS = false
 ; Set to true to prevent all users (including admin) from creating custom git hooks
 DISABLE_GIT_HOOKS = false
 ; Password Hash algorithm, either "pbkdf2", "argon2", "scrypt" or "bcrypt"
 PASSWORD_HASH_ALGO = pbkdf2
 [openid]
 ;
--- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md
+++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
@ -197,6 +197,7 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
 - `IMPORT_LOCAL_PATHS`: **false**: Set to `false` to prevent all users (including admin) from importing local path on server.
 - `INTERNAL_TOKEN`: **\<random at every install if no uri set\>**: Secret used to validate communication within Gitea binary.
 - `INTERNAL_TOKEN_URI`: **<empty>**: Instead of defining internal token in the configuration, this configuration option can be used to give Gitea a path to a file that contains the internal token (example value: `file:/etc/gitea/internal_token`)
 - `PASSWORD_HASH_ALGO`: **pbkdf2**: The hash algorithm to use \[pbkdf2, argon2, scrypt, bcrypt\].
 ## OpenID (`openid`)
--- a/models/login_source.go
+++ b/models/login_source.go
@ -22,6 +22,7 @@ import (
 	"code.gitea.io/gitea/modules/auth/oauth2"
 	"code.gitea.io/gitea/modules/auth/pam"
 	"code.gitea.io/gitea/modules/log"
 	"code.gitea.io/gitea/modules/setting"
 	"code.gitea.io/gitea/modules/util"
 )
@ -665,6 +666,15 @@ func UserSignIn(username, password string) (*User, error) {
 		switch user.LoginType {
 		case LoginNoType, LoginPlain, LoginOAuth2:
 			if user.IsPasswordSet() && user.ValidatePassword(password) {
 				// Update the password hash if the server's password hash algorithm has changed
 				if user.PasswdHashAlgo != setting.PasswordHashAlgo {
 					user.HashPassword(password)
 					if err := UpdateUserCols(user, "passwd", "passwd_hash_algo"); err != nil {
 						return nil, err
 					}
 				}
 				// WARN: DON'T check user.IsActive here; that is checked on reqSign so that
 				// the user can be prompted to resend the confirmation email.
 				if user.ProhibitLogin {
--- a/models/user.go
+++ b/models/user.go
@ -33,7 +33,10 @@ import (
 	"github.com/Unknwon/com"
 	"github.com/go-xorm/xorm"
 	"golang.org/x/crypto/argon2"
 	"golang.org/x/crypto/bcrypt"
 	"golang.org/x/crypto/pbkdf2"
 	"golang.org/x/crypto/scrypt"
 	"golang.org/x/crypto/ssh"
 	"xorm.io/builder"
 	"xorm.io/core"
@ -50,6 +53,13 @@ const (
 	UserTypeOrganization
 )
 // Names of the supported password hash algorithms. hashPassword switches on
 // these values and treats anything else as pbkdf2.
 const (
 	algoBcrypt = "bcrypt"
 	algoScrypt = "scrypt"
 	algoArgon2 = "argon2"
 	algoPbkdf2 = "pbkdf2"
 )
 const syncExternalUsers = "sync_external_users"
 var (
@ -82,6 +92,7 @@ type User struct {
 	Email            string `xorm:"NOT NULL"`
 	KeepEmailPrivate bool
 	Passwd           string `xorm:"NOT NULL"`
 	PasswdHashAlgo   string `xorm:"NOT NULL DEFAULT 'pbkdf2'"`
 	// MustChangePassword is an attribute that determines if a user
 	// is to change his/her password after registration.
@ -430,25 +441,48 @@ func (u *User) NewGitSig() *git.Signature {
 	}
 }
 // hashPassword derives the stored form of passwd with the algorithm named by
 // algo. bcrypt returns its own encoded string and is not given salt; scrypt,
 // argon2 and pbkdf2 return the hex encoding of a 50-byte key derived from
 // passwd and salt. Any unrecognised algo is handled as pbkdf2.
 //
 // NOTE(review): the errors returned by bcrypt.GenerateFromPassword and
 // scrypt.Key are discarded; presumably a failure leaves tempPasswd empty and
 // an empty string is returned — confirm and consider surfacing the error.
-func hashPassword(passwd, salt string) string {
+func hashPassword(passwd, salt, algo string) string {
-	tempPasswd := pbkdf2.Key([]byte(passwd), []byte(salt), 10000, 50, sha256.New)
+	var tempPasswd []byte
 	switch algo {
 	case algoBcrypt:
 		tempPasswd, _ = bcrypt.GenerateFromPassword([]byte(passwd), bcrypt.DefaultCost)
 		return string(tempPasswd)
 	case algoScrypt:
 		tempPasswd, _ = scrypt.Key([]byte(passwd), []byte(salt), 65536, 16, 2, 50)
 	case algoArgon2:
 		tempPasswd = argon2.IDKey([]byte(passwd), []byte(salt), 2, 65536, 8, 50)
 	case algoPbkdf2:
 		fallthrough
 	default:
 		tempPasswd = pbkdf2.Key([]byte(passwd), []byte(salt), 10000, 50, sha256.New)
 	}
 	return fmt.Sprintf("%x", tempPasswd)
 }
-// HashPassword hashes a password using PBKDF.
+// HashPassword replaces the user's stored password hash with one derived from
+// passwd using the algorithm configured in PASSWORD_HASH_ALGO, and records
+// that algorithm name on the user alongside the hash.
 func (u *User) HashPassword(passwd string) {
-	u.Passwd = hashPassword(passwd, u.Salt)
+	algo := setting.PasswordHashAlgo
+	u.PasswdHashAlgo = algo
+	u.Passwd = hashPassword(passwd, u.Salt, algo)
 }
 // ValidatePassword reports whether passwd matches the user's stored password
 // hash.
 //
 // bcrypt hashes are verified with bcrypt.CompareHashAndPassword. Hashes of
 // every other algorithm are re-derived from passwd and u.Salt with the
 // algorithm recorded in u.PasswdHashAlgo and compared in constant time.
 func (u *User) ValidatePassword(passwd string) bool {
-	tempHash := hashPassword(passwd, u.Salt)
-	return subtle.ConstantTimeCompare([]byte(u.Passwd), []byte(tempHash)) == 1
+	if u.PasswdHashAlgo == algoBcrypt {
+		// Do not go through hashPassword here: it would run a complete bcrypt
+		// derivation whose result is never compared and is simply thrown away.
+		return bcrypt.CompareHashAndPassword([]byte(u.Passwd), []byte(passwd)) == nil
+	}
+	tempHash := hashPassword(passwd, u.Salt, u.PasswdHashAlgo)
+	return subtle.ConstantTimeCompare([]byte(u.Passwd), []byte(tempHash)) == 1
 }
 // IsPasswordSet reports whether the user has any stored password hash.
 func (u *User) IsPasswordSet() bool {
-	return !u.ValidatePassword("")
+	return u.Passwd != ""
 }
 // UploadAvatar saves custom avatar for user.
--- a/models/user_test.go
+++ b/models/user_test.go
@ -147,6 +147,9 @@ func TestHashPasswordDeterministic(t *testing.T) {
 	b := make([]byte, 16)
 	rand.Read(b)
 	u := &User{Salt: string(b)}
 	algos := []string{"pbkdf2", "argon2", "scrypt", "bcrypt"}
 	for j := 0; j < len(algos); j++ {
 		u.PasswdHashAlgo = algos[j]
 		for i := 0; i < 50; i++ {
 			// generate a random password
 			rand.Read(b)
@ -160,10 +163,15 @@ func TestHashPasswordDeterministic(t *testing.T) {
 			u.HashPassword(pass)
 			r2 := u.Passwd
-		// assert equal (given the same salt+pass, the same result is produced)
+			// assert equal (given the same salt+pass, the same result is produced) except bcrypt
 			if u.PasswdHashAlgo == "bcrypt" {
 				assert.NotEqual(t, r1, r2)
 			} else {
 				assert.Equal(t, r1, r2)
 			}
 		}
 	}
 }
 func BenchmarkHashPassword(b *testing.B) {
 	// BenchmarkHashPassword ensures that it takes a reasonable amount of time
--- a/modules/setting/setting.go
+++ b/modules/setting/setting.go
@ -154,6 +154,7 @@ var (
 	MinPasswordLength     int
 	ImportLocalPaths      bool
 	DisableGitHooks       bool
 	PasswordHashAlgo      string
 	// Database settings
 	UseSQLite3       bool
@ -779,6 +780,7 @@ func NewContext() {
 	MinPasswordLength = sec.Key("MIN_PASSWORD_LENGTH").MustInt(6)
 	ImportLocalPaths = sec.Key("IMPORT_LOCAL_PATHS").MustBool(false)
 	DisableGitHooks = sec.Key("DISABLE_GIT_HOOKS").MustBool(false)
 	PasswordHashAlgo = sec.Key("PASSWORD_HASH_ALGO").MustString("pbkdf2")
 	InternalToken = loadInternalToken(sec)
 	IterateBufferSize = Cfg.Section("database").Key("ITERATE_BUFFER_SIZE").MustInt(50)
 	LogSQL = Cfg.Section("database").Key("LOG_SQL").MustBool(true)
--- a/vendor/golang.org/x/crypto/argon2/argon2.go
+++ b/vendor/golang.org/x/crypto/argon2/argon2.go
@ -0,0 +1,285 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package argon2 implements the key derivation function Argon2.
 // Argon2 was selected as the winner of the Password Hashing Competition and can
 // be used to derive cryptographic keys from passwords.
 //
 // For a detailed specification of Argon2 see [1].
 //
 // If you aren't sure which function you need, use Argon2id (IDKey) and
 // the parameter recommendations for your scenario.
 //
 //
 // Argon2i
 //
 // Argon2i (implemented by Key) is the side-channel resistant version of Argon2.
 // It uses data-independent memory access, which is preferred for password
 // hashing and password-based key derivation. Argon2i requires more passes over
 // memory than Argon2id to protect from trade-off attacks. The recommended
 // parameters (taken from [2]) for non-interactive operations are time=3 and to
 // use the maximum available memory.
 //
 //
 // Argon2id
 //
 // Argon2id (implemented by IDKey) is a hybrid version of Argon2 combining
 // Argon2i and Argon2d. It uses data-independent memory access for the first
 // half of the first iteration over the memory and data-dependent memory access
 // for the rest. Argon2id is side-channel resistant and provides better brute-
 // force cost savings due to time-memory tradeoffs than Argon2i. The recommended
 // parameters for non-interactive operations (taken from [2]) are time=1 and to
 // use the maximum available memory.
 //
 // [1] https://github.com/P-H-C/phc-winner-argon2/blob/master/argon2-specs.pdf
 // [2] https://tools.ietf.org/html/draft-irtf-cfrg-argon2-03#section-9.3
 package argon2
 import (
 	"encoding/binary"
 	"sync"
 	"golang.org/x/crypto/blake2b"
 )
 // The Argon2 version implemented by this package.
 const Version = 0x13
 const (
 	argon2d = iota
 	argon2i
 	argon2id
 )
 // Key runs Argon2i over password and salt with the given cost parameters and
 // returns keyLen bytes of derived key material usable as a cryptographic key.
 // time (the CPU cost) and threads (the parallelism degree) must both be
 // greater than zero.
 //
 // A 32-byte key, e.g. for AES-256, can be derived with:
 //
 //      key := argon2.Key([]byte("some password"), salt, 3, 32*1024, 4, 32)
 //
 // The draft RFC recommends[2] time=3, and memory=32*1024 is a sensible number.
 // When that much memory (32 MB) is not available, increase time to compensate.
 //
 // time is the number of passes over the memory and memory is the memory size
 // in KiB, so memory=32*1024 costs ~32 MB. threads can be matched to the number
 // of available CPUs. Raise the cost parameters as memory latency and CPU
 // parallelism increase, and remember to use a good random salt.
 func Key(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
 	derived := deriveKey(argon2i, password, salt, nil, nil, time, memory, threads, keyLen)
 	return derived
 }
 // IDKey runs Argon2id over password and salt with the given cost parameters
 // and returns keyLen bytes of derived key material usable as a cryptographic
 // key. time (the CPU cost) and threads (the parallelism degree) must both be
 // greater than zero.
 //
 // A 32-byte key, e.g. for AES-256, can be derived with:
 //
 //      key := argon2.IDKey([]byte("some password"), salt, 1, 64*1024, 4, 32)
 //
 // The draft RFC recommends[2] time=1, and memory=64*1024 is a sensible number.
 // When that much memory (64 MB) is not available, increase time to compensate.
 //
 // time is the number of passes over the memory and memory is the memory size
 // in KiB, so memory=64*1024 costs ~64 MB. threads can be matched to the number
 // of available CPUs. Raise the cost parameters as memory latency and CPU
 // parallelism increase, and remember to use a good random salt.
 func IDKey(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
 	derived := deriveKey(argon2id, password, salt, nil, nil, time, memory, threads, keyLen)
 	return derived
 }
 // deriveKey is the shared implementation behind Key and IDKey. It panics when
 // time or threads is zero, hashes all inputs into the initial digest, rounds
 // memory down to a multiple of syncPoints*threads (with a minimum of twice
 // that), fills and processes the memory blocks, and extracts keyLen bytes.
 func deriveKey(mode int, password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
 	switch {
 	case time < 1:
 		panic("argon2: number of rounds too small")
 	case threads < 1:
 		panic("argon2: parallelism degree too low")
 	}
 	lanes := uint32(threads)
 	// The initial digest is computed over the caller's memory value, before rounding.
 	h0 := initHash(password, salt, secret, data, time, memory, lanes, keyLen, mode)
 	unit := syncPoints * lanes
 	memory = memory / unit * unit
 	if lowest := 2 * unit; memory < lowest {
 		memory = lowest
 	}
 	B := initBlocks(&h0, memory, lanes)
 	processBlocks(B, time, memory, lanes, mode)
 	return extractKey(B, memory, lanes, keyLen)
 }
 const (
 	blockLength = 128
 	syncPoints  = 4
 )
 type block [blockLength]uint64
 // initHash computes the initial BLAKE2b-512 digest of all inputs. It absorbs
 // six 32-bit little-endian values (threads, keyLen, memory, time, Version,
 // mode) followed by password, salt, key and data, each preceded by its length
 // as a 32-bit little-endian value. The digest fills the first blake2b.Size
 // bytes of the result; the trailing 8 bytes are left zero.
 func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte {
 	var (
 		seed   [blake2b.Size + 8]byte
 		header [24]byte
 		length [4]byte
 	)
 	digest, _ := blake2b.New512(nil)
 	words := [...]uint32{threads, keyLen, memory, time, uint32(Version), uint32(mode)}
 	for i, w := range words {
 		binary.LittleEndian.PutUint32(header[4*i:4*i+4], w)
 	}
 	digest.Write(header[:])
 	for _, field := range [][]byte{password, salt, key, data} {
 		binary.LittleEndian.PutUint32(length[:], uint32(len(field)))
 		digest.Write(length[:])
 		digest.Write(field)
 	}
 	digest.Sum(seed[:0])
 	return seed
 }
 // initBlocks allocates memory blocks and fills the first two blocks of every
 // lane. Each of those blocks is the 1024-byte blake2bHash of h0 with the block
 // number (0 or 1) and the lane number written into its trailing 8 bytes,
 // decoded as little-endian 64-bit words.
 func initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []block {
 	var block0 [1024]byte
 	B := make([]block, memory)
 	for lane := uint32(0); lane < threads; lane++ {
 		first := lane * (memory / threads)
 		binary.LittleEndian.PutUint32(h0[blake2b.Size+4:], lane)
 		for k := uint32(0); k < 2; k++ {
 			binary.LittleEndian.PutUint32(h0[blake2b.Size:], k)
 			blake2bHash(block0[:], h0[:])
 			dst := &B[first+k]
 			for i := range dst {
 				dst[i] = binary.LittleEndian.Uint64(block0[i*8:])
 			}
 		}
 	}
 	return B
 }
 // processBlocks fills the memory B by making `time` passes over it. Every pass
 // is split into syncPoints slices; within a slice each lane is processed by
 // its own goroutine, and all lanes finish the slice before the next one starts.
 func processBlocks(B []block, time, memory, threads uint32, mode int) {
 	// NOTE: despite its name, lanes holds the number of blocks in one lane.
 	lanes := memory / threads
 	// Number of blocks in one segment (one lane's share of one slice).
 	segments := lanes / syncPoints
 	// processSegment computes the blocks of the segment identified by pass n,
 	// slice and lane, then signals wg.
 	processSegment := func(n, slice, lane uint32, wg *sync.WaitGroup) {
 		var addresses, in, zero block
 		// The counter block `in` is only needed when reference indices are read
 		// from `addresses`: always for Argon2i, and for Argon2id only during the
 		// first half of the slices of the first pass.
 		if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
 			in[0] = uint64(n)
 			in[1] = uint64(lane)
 			in[2] = uint64(slice)
 			in[3] = uint64(memory)
 			in[4] = uint64(time)
 			in[5] = uint64(mode)
 		}
 		index := uint32(0)
 		if n == 0 && slice == 0 {
 			index = 2 // we have already generated the first two blocks
 			if mode == argon2i || mode == argon2id {
 				in[6]++
 				processBlock(&addresses, &in, &zero)
 				processBlock(&addresses, &addresses, &zero)
 			}
 		}
 		// Absolute position in B of the next block to compute.
 		offset := lane*lanes + slice*segments + index
 		var random uint64
 		for index < segments {
 			prev := offset - 1
 			if index == 0 && slice == 0 {
 				prev += lanes // last block in lane
 			}
 			if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
 				// Regenerate the address block every blockLength indices.
 				if index%blockLength == 0 {
 					in[6]++
 					processBlock(&addresses, &in, &zero)
 					processBlock(&addresses, &addresses, &zero)
 				}
 				random = addresses[index%blockLength]
 			} else {
 				// Otherwise the value comes from the first word of the previous block.
 				random = B[prev][0]
 			}
 			newOffset := indexAlpha(random, lanes, segments, threads, n, slice, lane, index)
 			processBlockXOR(&B[offset], &B[prev], &B[newOffset])
 			index, offset = index+1, offset+1
 		}
 		wg.Done()
 	}
 	for n := uint32(0); n < time; n++ {
 		for slice := uint32(0); slice < syncPoints; slice++ {
 			var wg sync.WaitGroup
 			for lane := uint32(0); lane < threads; lane++ {
 				wg.Add(1)
 				go processSegment(n, slice, lane, &wg)
 			}
 			// Barrier: every lane must complete this slice before the next begins.
 			wg.Wait()
 		}
 	}
 }
 // extractKey XORs the last block of every lane into the final block of B,
 // serialises that block as 1024 little-endian bytes and returns its
 // blake2bHash of length keyLen.
 func extractKey(B []block, memory, threads, keyLen uint32) []byte {
 	laneLen := memory / threads
 	last := &B[memory-1]
 	for lane := uint32(0); lane < threads-1; lane++ {
 		tail := &B[lane*laneLen+laneLen-1]
 		for i := range last {
 			last[i] ^= tail[i]
 		}
 	}
 	var raw [1024]byte
 	for i, word := range last {
 		binary.LittleEndian.PutUint64(raw[i*8:], word)
 	}
 	out := make([]byte, keyLen)
 	blake2bHash(out, raw[:])
 	return out
 }
 // indexAlpha maps the pseudo-random value rand to the index in B of the
 // reference block for the block at position index of segment (n, slice, lane).
 // The reference lane is taken from the high 32 bits of rand, except in the
 // first slice of the first pass where it is always the current lane. m and s
 // are then passed to phi together with rand.
 // NOTE(review): m and s look like the size and start offset of the window of
 // blocks that may be referenced — confirm against the Argon2 specification.
 func indexAlpha(rand uint64, lanes, segments, threads, n, slice, lane, index uint32) uint32 {
 	refLane := uint32(rand>>32) % threads
 	if n == 0 && slice == 0 {
 		refLane = lane
 	}
 	// Values used for every pass after the first.
 	m, s := 3*segments, ((slice+1)%syncPoints)*segments
 	if lane == refLane {
 		m += index
 	}
 	// The first pass overrides them.
 	if n == 0 {
 		m, s = slice*segments, 0
 		if slice == 0 || lane == refLane {
 			m += index
 		}
 	}
 	if index == 0 || lane == refLane {
 		m--
 	}
 	return phi(rand, uint64(m), uint64(s), refLane, lanes)
 }
 // phi turns the low 32 bits of rand into a block index inside the given lane:
 // the value is squared and scaled by m (keeping the high 32 bits each time),
 // subtracted from s+m-1, reduced modulo lanes and added to lane*lanes.
 func phi(rand, m, s uint64, lane, lanes uint32) uint32 {
 	low := rand & 0xFFFFFFFF
 	squared := (low * low) >> 32
 	scaled := (squared * m) >> 32
 	rel := (s + m - (scaled + 1)) % uint64(lanes)
 	return lane*lanes + uint32(rel)
 }
--- a/vendor/golang.org/x/crypto/argon2/blake2b.go
+++ b/vendor/golang.org/x/crypto/argon2/blake2b.go
@ -0,0 +1,53 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package argon2
 import (
 	"encoding/binary"
 	"hash"
 	"golang.org/x/crypto/blake2b"
 )
 // blake2bHash computes an arbitrary long hash value of in
 // and writes the hash to out.
 //
 // The hashed input is always len(out) as a 32-bit little-endian value followed
 // by in. Outputs of at most blake2b.Size bytes are a single BLAKE2b digest.
 // Longer outputs are built by repeatedly hashing the previous 64-byte digest
 // and emitting the first 32 bytes of each one, with the final digest written
 // out in full.
 func blake2bHash(out []byte, in []byte) {
 	var b2 hash.Hash
 	// Short outputs use a digest of exactly len(out) bytes.
 	if n := len(out); n < blake2b.Size {
 		b2, _ = blake2b.New(n, nil)
 	} else {
 		b2, _ = blake2b.New512(nil)
 	}
 	var buffer [blake2b.Size]byte
 	binary.LittleEndian.PutUint32(buffer[:4], uint32(len(out)))
 	b2.Write(buffer[:4])
 	b2.Write(in)
 	if len(out) <= blake2b.Size {
 		b2.Sum(out[:0])
 		return
 	}
 	// Long output: keep the first digest in buffer and emit its first half.
 	outLen := len(out)
 	b2.Sum(buffer[:0])
 	b2.Reset()
 	copy(out, buffer[:32])
 	out = out[32:]
 	// Each step hashes the previous digest and emits 32 more bytes.
 	for len(out) > blake2b.Size {
 		b2.Write(buffer[:])
 		b2.Sum(buffer[:0])
 		copy(out, buffer[:32])
 		out = out[32:]
 		b2.Reset()
 	}
 	// When outLen is not a multiple of 64, the last step uses a hash whose
 	// digest size is outLen-32*r instead of the 64-byte one.
 	if outLen%blake2b.Size > 0 { // outLen > 64
 		r := ((outLen + 31) / 32) - 2 // ⌈τ /32⌉-2
 		b2, _ = blake2b.New(outLen-32*r, nil)
 	}
 	b2.Write(buffer[:])
 	b2.Sum(out[:0])
 }
--- a/vendor/golang.org/x/crypto/argon2/blamka_amd64.go
+++ b/vendor/golang.org/x/crypto/argon2/blamka_amd64.go
@ -0,0 +1,60 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64,!gccgo,!appengine
 package argon2
 import "golang.org/x/sys/cpu"
 // init records whether the CPU reports SSE4.1 support; processBlockSSE uses
 // the flag to choose between blamkaSSE4 and the generic Go rounds.
 func init() {
 	useSSE4 = cpu.X86.HasSSE41
 }
 //go:noescape
 func mixBlocksSSE2(out, a, b, c *block)
 //go:noescape
 func xorBlocksSSE2(out, a, b, c *block)
 //go:noescape
 func blamkaSSE4(b *block)
 // processBlockSSE is the amd64 block compression. It sets t to in1 XOR in2,
 // applies the BlaMka rounds to t (in assembly when useSSE4 is set, otherwise
 // with blamkaGeneric), and combines t with in1 and in2 again. The result
 // overwrites out, or is XORed into the existing contents of out when xor is
 // true.
 func processBlockSSE(out, in1, in2 *block, xor bool) {
 	var t block
 	// t is still all zero here, so this stores in1 XOR in2 into t.
 	mixBlocksSSE2(&t, in1, in2, &t)
 	if useSSE4 {
 		blamkaSSE4(&t)
 	} else {
 		// One round over each run of 16 consecutive words.
 		for i := 0; i < blockLength; i += 16 {
 			blamkaGeneric(
 				&t[i+0], &t[i+1], &t[i+2], &t[i+3],
 				&t[i+4], &t[i+5], &t[i+6], &t[i+7],
 				&t[i+8], &t[i+9], &t[i+10], &t[i+11],
 				&t[i+12], &t[i+13], &t[i+14], &t[i+15],
 			)
 		}
 		// One round over each group of word pairs spaced 16 words apart.
 		for i := 0; i < blockLength/8; i += 2 {
 			blamkaGeneric(
 				&t[i], &t[i+1], &t[16+i], &t[16+i+1],
 				&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
 				&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
 				&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
 			)
 		}
 	}
 	if xor {
 		xorBlocksSSE2(out, in1, in2, &t)
 	} else {
 		mixBlocksSSE2(out, in1, in2, &t)
 	}
 }
 // processBlock compresses in1 and in2 with processBlockSSE and overwrites out
 // with the result.
 func processBlock(out, in1, in2 *block) {
 	processBlockSSE(out, in1, in2, false)
 }
 // processBlockXOR compresses in1 and in2 with processBlockSSE and XORs the
 // result into the existing contents of out.
 func processBlockXOR(out, in1, in2 *block) {
 	processBlockSSE(out, in1, in2, true)
 }
--- a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
+++ b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
@ -0,0 +1,243 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64,!gccgo,!appengine
 #include "textflag.h"
 // c40 is the PSHUFB byte mask used by HALF_ROUND. Within each 64-bit lane it
 // moves source byte 3 to position 0, i.e. rotates the lane right by 24 bits
 // (left by 40).
 DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
 // c48 is the PSHUFB byte mask used by HALF_ROUND. Within each 64-bit lane it
 // moves source byte 2 to position 0, i.e. rotates the lane right by 16 bits
 // (left by 48).
 DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
 // SHUFFLE swaps v4 with v5 and recombines the 64-bit halves of the v6/v7 and
 // v2/v3 register pairs, using t1 and t2 as scratch registers.
 // NOTE(review): presumably this rearranges the state so that the following
 // HALF_ROUND works on diagonals instead of columns — confirm.
 #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
 	MOVO       v4, t1; \
 	MOVO       v5, v4; \
 	MOVO       t1, v5; \
 	MOVO       v6, t1; \
 	PUNPCKLQDQ v6, t2; \
 	PUNPCKHQDQ v7, v6; \
 	PUNPCKHQDQ t2, v6; \
 	PUNPCKLQDQ v7, t2; \
 	MOVO       t1, v7; \
 	MOVO       v2, t1; \
 	PUNPCKHQDQ t2, v7; \
 	PUNPCKLQDQ v3, t2; \
 	PUNPCKHQDQ t2, v2; \
 	PUNPCKLQDQ t1, t2; \
 	PUNPCKHQDQ t2, v3
 // SHUFFLE_INV swaps v4 with v5 and recombines the 64-bit halves of the v2/v3
 // and v6/v7 register pairs, using t1 and t2 as scratch registers.
 // NOTE(review): presumably the inverse of SHUFFLE, restoring the register
 // layout after the second HALF_ROUND — confirm.
 #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
 	MOVO       v4, t1; \
 	MOVO       v5, v4; \
 	MOVO       t1, v5; \
 	MOVO       v2, t1; \
 	PUNPCKLQDQ v2, t2; \
 	PUNPCKHQDQ v3, v2; \
 	PUNPCKHQDQ t2, v2; \
 	PUNPCKLQDQ v3, t2; \
 	MOVO       t1, v3; \
 	MOVO       v6, t1; \
 	PUNPCKHQDQ t2, v3; \
 	PUNPCKLQDQ v7, t2; \
 	PUNPCKHQDQ t2, v6; \
 	PUNPCKLQDQ t1, t2; \
 	PUNPCKHQDQ t2, v7
 // HALF_ROUND applies the BlaMka mixing step to the register groups
 // (v0, v2, v4, v6) and (v1, v3, v5, v7), two 64-bit lanes per register.
 // Each step computes a = a + b + 2*lo32(a)*lo32(b) (MOVO/PMULULQ and three
 // PADDQ), XORs the result into the partner register and rotates it: by 32
 // bits (PSHUFD $0xB1), by 24 and 16 bits to the right (PSHUFB with c40 and
 // c48), and by 63 bits to the right (PADDQ/PSRLQ/PXOR). t0 is scratch.
 #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
 	MOVO    v0, t0;        \
 	PMULULQ v2, t0;        \
 	PADDQ   v2, v0;        \
 	PADDQ   t0, v0;        \
 	PADDQ   t0, v0;        \
 	PXOR    v0, v6;        \
 	PSHUFD  $0xB1, v6, v6; \
 	MOVO    v4, t0;        \
 	PMULULQ v6, t0;        \
 	PADDQ   v6, v4;        \
 	PADDQ   t0, v4;        \
 	PADDQ   t0, v4;        \
 	PXOR    v4, v2;        \
 	PSHUFB  c40, v2;       \
 	MOVO    v0, t0;        \
 	PMULULQ v2, t0;        \
 	PADDQ   v2, v0;        \
 	PADDQ   t0, v0;        \
 	PADDQ   t0, v0;        \
 	PXOR    v0, v6;        \
 	PSHUFB  c48, v6;       \
 	MOVO    v4, t0;        \
 	PMULULQ v6, t0;        \
 	PADDQ   v6, v4;        \
 	PADDQ   t0, v4;        \
 	PADDQ   t0, v4;        \
 	PXOR    v4, v2;        \
 	MOVO    v2, t0;        \
 	PADDQ   v2, t0;        \
 	PSRLQ   $63, v2;       \
 	PXOR    t0, v2;        \
 	MOVO    v1, t0;        \
 	PMULULQ v3, t0;        \
 	PADDQ   v3, v1;        \
 	PADDQ   t0, v1;        \
 	PADDQ   t0, v1;        \
 	PXOR    v1, v7;        \
 	PSHUFD  $0xB1, v7, v7; \
 	MOVO    v5, t0;        \
 	PMULULQ v7, t0;        \
 	PADDQ   v7, v5;        \
 	PADDQ   t0, v5;        \
 	PADDQ   t0, v5;        \
 	PXOR    v5, v3;        \
 	PSHUFB  c40, v3;       \
 	MOVO    v1, t0;        \
 	PMULULQ v3, t0;        \
 	PADDQ   v3, v1;        \
 	PADDQ   t0, v1;        \
 	PADDQ   t0, v1;        \
 	PXOR    v1, v7;        \
 	PSHUFB  c48, v7;       \
 	MOVO    v5, t0;        \
 	PMULULQ v7, t0;        \
 	PADDQ   v7, v5;        \
 	PADDQ   t0, v5;        \
 	PADDQ   t0, v5;        \
 	PXOR    v5, v3;        \
 	MOVO    v3, t0;        \
 	PADDQ   v3, t0;        \
 	PSRLQ   $63, v3;       \
 	PXOR    t0, v3
 // LOAD_MSG_0 loads the 16 consecutive 64-bit words starting at word index off
 // of block into X0..X7, two words per register.
 #define LOAD_MSG_0(block, off) \
 	MOVOU 8*(off+0)(block), X0;  \
 	MOVOU 8*(off+2)(block), X1;  \
 	MOVOU 8*(off+4)(block), X2;  \
 	MOVOU 8*(off+6)(block), X3;  \
 	MOVOU 8*(off+8)(block), X4;  \
 	MOVOU 8*(off+10)(block), X5; \
 	MOVOU 8*(off+12)(block), X6; \
 	MOVOU 8*(off+14)(block), X7
 // STORE_MSG_0 writes X0..X7 back to the 16 consecutive 64-bit words starting
 // at word index off of block (the layout read by LOAD_MSG_0).
 #define STORE_MSG_0(block, off) \
 	MOVOU X0, 8*(off+0)(block);  \
 	MOVOU X1, 8*(off+2)(block);  \
 	MOVOU X2, 8*(off+4)(block);  \
 	MOVOU X3, 8*(off+6)(block);  \
 	MOVOU X4, 8*(off+8)(block);  \
 	MOVOU X5, 8*(off+10)(block); \
 	MOVOU X6, 8*(off+12)(block); \
 	MOVOU X7, 8*(off+14)(block)
 // LOAD_MSG_1 loads eight pairs of 64-bit words of block into X0..X7. The
 // pairs start at word index off and are spaced 16 words apart
 // (off, off+16, ..., off+112).
 #define LOAD_MSG_1(block, off) \
 	MOVOU 8*off+0*8(block), X0;  \
 	MOVOU 8*off+16*8(block), X1; \
 	MOVOU 8*off+32*8(block), X2; \
 	MOVOU 8*off+48*8(block), X3; \
 	MOVOU 8*off+64*8(block), X4; \
 	MOVOU 8*off+80*8(block), X5; \
 	MOVOU 8*off+96*8(block), X6; \
 	MOVOU 8*off+112*8(block), X7
 // STORE_MSG_1 writes X0..X7 back to the eight word pairs of block that start
 // at word index off and are spaced 16 words apart (the layout read by
 // LOAD_MSG_1).
 #define STORE_MSG_1(block, off) \
 	MOVOU X0, 8*off+0*8(block);  \
 	MOVOU X1, 8*off+16*8(block); \
 	MOVOU X2, 8*off+32*8(block); \
 	MOVOU X3, 8*off+48*8(block); \
 	MOVOU X4, 8*off+64*8(block); \
 	MOVOU X5, 8*off+80*8(block); \
 	MOVOU X6, 8*off+96*8(block); \
 	MOVOU X7, 8*off+112*8(block)
 // BLAMKA_ROUND_0 runs one full round in place on the 16 consecutive words at
 // word index off: load, HALF_ROUND, SHUFFLE, HALF_ROUND, SHUFFLE_INV, store.
 // t0 and t1 are scratch registers; c40 and c48 hold the PSHUFB masks.
 #define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
 	LOAD_MSG_0(block, off);                                   \
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
 	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
 	STORE_MSG_0(block, off)
 // BLAMKA_ROUND_1 runs one full round in place on the eight word pairs that
 // start at word index off and are spaced 16 words apart: load, HALF_ROUND,
 // SHUFFLE, HALF_ROUND, SHUFFLE_INV, store. t0 and t1 are scratch registers;
 // c40 and c48 hold the PSHUFB masks.
 #define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
 	LOAD_MSG_1(block, off);                                   \
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
 	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
 	STORE_MSG_1(block, off)
 // func blamkaSSE4(b *block)
 //
 // Applies the BlaMka rounds to b in place: first BLAMKA_ROUND_0 on each of the
 // eight runs of 16 consecutive words, then BLAMKA_ROUND_1 on each of the eight
 // groups of word pairs spaced 16 words apart. X10 and X11 hold the c40 and c48
 // shuffle masks; X8 and X9 are scratch.
 TEXT ·blamkaSSE4(SB), 4, $0-8
 	MOVQ b+0(FP), AX
 	MOVOU ·c40<>(SB), X10
 	MOVOU ·c48<>(SB), X11
 	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
 	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
 	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
 	RET
 // func mixBlocksSSE2(out, a, b, c *block)
 //
 // Stores a XOR b XOR c into out, processing two 64-bit words (16 bytes) per
 // iteration until all 128 words of the block are done.
 TEXT ·mixBlocksSSE2(SB), 4, $0-32
 	MOVQ out+0(FP), DX
 	MOVQ a+8(FP), AX
 	MOVQ b+16(FP), BX
 	MOVQ c+24(FP), CX // the argument at offset 24 is c; the name must match the Go declaration
 	MOVQ $128, DI     // words left to process; DI rather than BP so the frame pointer is not clobbered
 loop:
 	MOVOU 0(AX), X0
 	MOVOU 0(BX), X1
 	MOVOU 0(CX), X2
 	PXOR  X1, X0
 	PXOR  X2, X0
 	MOVOU X0, 0(DX)
 	ADDQ  $16, AX
 	ADDQ  $16, BX
 	ADDQ  $16, CX
 	ADDQ  $16, DX
 	SUBQ  $2, DI
 	JA    loop
 	RET
 // func xorBlocksSSE2(out, a, b, c *block)
 //
 // XORs a XOR b XOR c into the existing contents of out, processing two 64-bit
 // words (16 bytes) per iteration until all 128 words of the block are done.
 TEXT ·xorBlocksSSE2(SB), 4, $0-32
 	MOVQ out+0(FP), DX
 	MOVQ a+8(FP), AX
 	MOVQ b+16(FP), BX
 	MOVQ c+24(FP), CX // the argument at offset 24 is c; the name must match the Go declaration
 	MOVQ $128, DI     // words left to process; DI rather than BP so the frame pointer is not clobbered
 loop:
 	MOVOU 0(AX), X0
 	MOVOU 0(BX), X1
 	MOVOU 0(CX), X2
 	MOVOU 0(DX), X3
 	PXOR  X1, X0
 	PXOR  X2, X0
 	PXOR  X3, X0
 	MOVOU X0, 0(DX)
 	ADDQ  $16, AX
 	ADDQ  $16, BX
 	ADDQ  $16, CX
 	ADDQ  $16, DX
 	SUBQ  $2, DI
 	JA    loop
 	RET
--- a/vendor/golang.org/x/crypto/argon2/blamka_generic.go
+++ b/vendor/golang.org/x/crypto/argon2/blamka_generic.go
@ -0,0 +1,163 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package argon2
 var useSSE4 bool
 // processBlockGeneric is the portable block compression. It sets t to
 // in1 XOR in2, applies blamkaGeneric to each run of 16 consecutive words and
 // then to each group of word pairs spaced 16 words apart, and finally combines
 // in1 XOR in2 XOR t into out: overwriting it, or XORing into it when xor is
 // true.
 func processBlockGeneric(out, in1, in2 *block, xor bool) {
 	var t block
 	for i := range t {
 		t[i] = in1[i] ^ in2[i]
 	}
 	for row := 0; row < blockLength; row += 16 {
 		blamkaGeneric(
 			&t[row+0], &t[row+1], &t[row+2], &t[row+3],
 			&t[row+4], &t[row+5], &t[row+6], &t[row+7],
 			&t[row+8], &t[row+9], &t[row+10], &t[row+11],
 			&t[row+12], &t[row+13], &t[row+14], &t[row+15],
 		)
 	}
 	for col := 0; col < blockLength/8; col += 2 {
 		blamkaGeneric(
 			&t[col], &t[col+1], &t[16+col], &t[16+col+1],
 			&t[32+col], &t[32+col+1], &t[48+col], &t[48+col+1],
 			&t[64+col], &t[64+col+1], &t[80+col], &t[80+col+1],
 			&t[96+col], &t[96+col+1], &t[112+col], &t[112+col+1],
 		)
 	}
 	for i := range t {
 		mixed := in1[i] ^ in2[i] ^ t[i]
 		if xor {
 			out[i] ^= mixed
 		} else {
 			out[i] = mixed
 		}
 	}
 }
 // blamkaGeneric applies one BlaMka round in place to the 16 words referenced
 // by its arguments. The words are copied into locals, mixed in eight groups of
 // four, and written back at the end. Each group performs, twice over, the
 // steps a += b + 2*lo32(a)*lo32(b); d ^= a; rotate d; c += d + 2*lo32(c)*lo32(d);
 // b ^= c; rotate b — with right-rotations by 32, 24, 16 and 63 bits.
 func blamkaGeneric(t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15 *uint64) {
 	v00, v01, v02, v03 := *t00, *t01, *t02, *t03
 	v04, v05, v06, v07 := *t04, *t05, *t06, *t07
 	v08, v09, v10, v11 := *t08, *t09, *t10, *t11
 	v12, v13, v14, v15 := *t12, *t13, *t14, *t15
 	// Group (v00, v04, v08, v12).
 	v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
 	v12 ^= v00
 	v12 = v12>>32 | v12<<32
 	v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
 	v04 ^= v08
 	v04 = v04>>24 | v04<<40
 	v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
 	v12 ^= v00
 	v12 = v12>>16 | v12<<48
 	v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
 	v04 ^= v08
 	v04 = v04>>63 | v04<<1
 	// Group (v01, v05, v09, v13).
 	v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
 	v13 ^= v01
 	v13 = v13>>32 | v13<<32
 	v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
 	v05 ^= v09
 	v05 = v05>>24 | v05<<40
 	v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
 	v13 ^= v01
 	v13 = v13>>16 | v13<<48
 	v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
 	v05 ^= v09
 	v05 = v05>>63 | v05<<1
 	// Group (v02, v06, v10, v14).
 	v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
 	v14 ^= v02
 	v14 = v14>>32 | v14<<32
 	v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
 	v06 ^= v10
 	v06 = v06>>24 | v06<<40
 	v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
 	v14 ^= v02
 	v14 = v14>>16 | v14<<48
 	v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
 	v06 ^= v10
 	v06 = v06>>63 | v06<<1
 	// Group (v03, v07, v11, v15).
 	v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
 	v15 ^= v03
 	v15 = v15>>32 | v15<<32
 	v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
 	v07 ^= v11
 	v07 = v07>>24 | v07<<40
 	v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
 	v15 ^= v03
 	v15 = v15>>16 | v15<<48
 	v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
 	v07 ^= v11
 	v07 = v07>>63 | v07<<1
 	// Group (v00, v05, v10, v15).
 	v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
 	v15 ^= v00
 	v15 = v15>>32 | v15<<32
 	v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
 	v05 ^= v10
 	v05 = v05>>24 | v05<<40
 	v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
 	v15 ^= v00
 	v15 = v15>>16 | v15<<48
 	v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
 	v05 ^= v10
 	v05 = v05>>63 | v05<<1
 	// Group (v01, v06, v11, v12).
 	v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
 	v12 ^= v01
 	v12 = v12>>32 | v12<<32
 	v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
 	v06 ^= v11
 	v06 = v06>>24 | v06<<40
 	v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
 	v12 ^= v01
 	v12 = v12>>16 | v12<<48
 	v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
 	v06 ^= v11
 	v06 = v06>>63 | v06<<1
 	// Group (v02, v07, v08, v13).
 	v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
 	v13 ^= v02
 	v13 = v13>>32 | v13<<32
 	v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
 	v07 ^= v08
 	v07 = v07>>24 | v07<<40
 	v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
 	v13 ^= v02
 	v13 = v13>>16 | v13<<48
 	v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
 	v07 ^= v08
 	v07 = v07>>63 | v07<<1
 	// Group (v03, v04, v09, v14).
 	v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
 	v14 ^= v03
 	v14 = v14>>32 | v14<<32
 	v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
 	v04 ^= v09
 	v04 = v04>>24 | v04<<40
 	v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
 	v14 ^= v03
 	v14 = v14>>16 | v14<<48
 	v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
 	v04 ^= v09
 	v04 = v04>>63 | v04<<1
 	// Write the mixed words back through the pointers.
 	*t00, *t01, *t02, *t03 = v00, v01, v02, v03
 	*t04, *t05, *t06, *t07 = v04, v05, v06, v07
 	*t08, *t09, *t10, *t11 = v08, v09, v10, v11
 	*t12, *t13, *t14, *t15 = v12, v13, v14, v15
 }
--- a/vendor/golang.org/x/crypto/argon2/blamka_ref.go
+++ b/vendor/golang.org/x/crypto/argon2/blamka_ref.go
@ -0,0 +1,15 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64 appengine gccgo
 package argon2
 // processBlock compresses in1 and in2 with processBlockGeneric and overwrites
 // out with the result.
 func processBlock(out, in1, in2 *block) {
 	processBlockGeneric(out, in1, in2, false)
 }
 // processBlockXOR compresses in1 and in2 with processBlockGeneric and XORs the
 // result into the existing contents of out.
 func processBlockXOR(out, in1, in2 *block) {
 	processBlockGeneric(out, in1, in2, true)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2b.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b.go
@ -0,0 +1,289 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package blake2b implements the BLAKE2b hash algorithm defined by RFC 7693
 // and the extendable output function (XOF) BLAKE2Xb.
 //
 // For a detailed specification of BLAKE2b see https://blake2.net/blake2.pdf
 // and for BLAKE2Xb see https://blake2.net/blake2x.pdf
 //
 // If you aren't sure which function you need, use BLAKE2b (Sum512 or New512).
 // If you need a secret-key MAC (message authentication code), use the New512
 // function with a non-nil key.
 //
 // BLAKE2X is a construction to compute hash values larger than 64 bytes. It
 // can produce hash values between 0 and 4 GiB.
 package blake2b
 import (
 	"encoding/binary"
 	"errors"
 	"hash"
 )
 // Sizes, in bytes, of the BLAKE2b input block and of the fixed-length digest
 // variants exposed by this package.
 const (
 	// The blocksize of BLAKE2b in bytes.
 	BlockSize = 128
 	// The hash size of BLAKE2b-512 in bytes.
 	Size = 64
 	// The hash size of BLAKE2b-384 in bytes.
 	Size384 = 48
 	// The hash size of BLAKE2b-256 in bytes.
 	Size256 = 32
 )
 // CPU feature flags consulted by the amd64 hashBlocks dispatchers. They
 // default to false; the amd64 init functions set them from the detected CPU
 // features.
 var (
 	useAVX2 bool
 	useAVX  bool
 	useSSE4 bool
 )
 // Errors returned by newDigest when the requested key length or digest
 // length is out of range.
 var (
 	errKeySize  = errors.New("blake2b: invalid key size")
 	errHashSize = errors.New("blake2b: invalid hash size")
 )
 // iv holds the eight 64-bit initialization words. checkSum and digest.Reset
 // start every hash state from a copy of it before mixing in the parameters.
 var iv = [8]uint64{
 	0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
 	0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 }
 // Sum512 computes the 64-byte BLAKE2b-512 digest of data in one call.
 func Sum512(data []byte) [Size]byte {
 	var out [Size]byte
 	checkSum(&out, Size, data)
 	return out
 }
 // Sum384 computes the 48-byte BLAKE2b-384 digest of data in one call.
 func Sum384(data []byte) [Size384]byte {
 	// checkSum always writes into a full-width buffer; only the leading
 	// Size384 bytes belong to the digest.
 	var full [Size]byte
 	checkSum(&full, Size384, data)
 	var out [Size384]byte
 	copy(out[:], full[:])
 	return out
 }
 // Sum256 computes the 32-byte BLAKE2b-256 digest of data in one call.
 func Sum256(data []byte) [Size256]byte {
 	// checkSum always writes into a full-width buffer; only the leading
 	// Size256 bytes belong to the digest.
 	var full [Size]byte
 	checkSum(&full, Size256, data)
 	var out [Size256]byte
 	copy(out[:], full[:])
 	return out
 }
 // New512 returns a new hash.Hash computing the BLAKE2b-512 checksum. A non-nil
 // key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 // If the key is too long, the returned hash.Hash is nil and the error is non-nil.
 func New512(key []byte) (hash.Hash, error) {
 	d, err := newDigest(Size, key)
 	if err != nil {
 		// Return an untyped nil: passing the nil *digest through would yield a
 		// non-nil hash.Hash that wraps a nil pointer.
 		return nil, err
 	}
 	return d, nil
 }
 // New384 returns a new hash.Hash computing the BLAKE2b-384 checksum. A non-nil
 // key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 // If the key is too long, the returned hash.Hash is nil and the error is non-nil.
 func New384(key []byte) (hash.Hash, error) {
 	d, err := newDigest(Size384, key)
 	if err != nil {
 		// Return an untyped nil: passing the nil *digest through would yield a
 		// non-nil hash.Hash that wraps a nil pointer.
 		return nil, err
 	}
 	return d, nil
 }
 // New256 returns a new hash.Hash computing the BLAKE2b-256 checksum. A non-nil
 // key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 // If the key is too long, the returned hash.Hash is nil and the error is non-nil.
 func New256(key []byte) (hash.Hash, error) {
 	d, err := newDigest(Size256, key)
 	if err != nil {
 		// Return an untyped nil: passing the nil *digest through would yield a
 		// non-nil hash.Hash that wraps a nil pointer.
 		return nil, err
 	}
 	return d, nil
 }
 // New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
 // A non-nil key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 // The hash size can be a value between 1 and 64 but it is highly recommended to use
 // values equal or greater than:
 // - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
 // - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
 // When the key is nil, the returned hash.Hash implements BinaryMarshaler
 // and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
 // If size or the key length is out of range, the returned hash.Hash is nil and
 // the error is non-nil.
 func New(size int, key []byte) (hash.Hash, error) {
 	d, err := newDigest(size, key)
 	if err != nil {
 		// Return an untyped nil: passing the nil *digest through would yield a
 		// non-nil hash.Hash that wraps a nil pointer.
 		return nil, err
 	}
 	return d, nil
 }
 // newDigest validates the requested digest length and key, then builds a
 // digest that has already been Reset and is ready for Write. It returns
 // errHashSize when hashSize is outside 1..Size and errKeySize when the key is
 // longer than Size bytes; the hash size is checked first.
 func newDigest(hashSize int, key []byte) (*digest, error) {
 	switch {
 	case hashSize < 1 || hashSize > Size:
 		return nil, errHashSize
 	case len(key) > Size:
 		return nil, errKeySize
 	}
 	d := new(digest)
 	d.size = hashSize
 	d.keyLen = len(key)
 	// The key array is a full block wide, so a short key stays zero-padded.
 	copy(d.key[:], key)
 	d.Reset()
 	return d, nil
 }
 // checkSum computes the unkeyed BLAKE2b digest of data in one shot and writes
 // it to the front of sum. Output is written in whole 8-byte words, so
 // (hashSize+7)/8 words are stored and callers take the first hashSize bytes.
 func checkSum(sum *[Size]byte, hashSize int, data []byte) {
 	state := iv
 	// Mix the parameter word into the state: digest length in the low byte,
 	// plus bits 16 and 24 (fanout = 1 and depth = 1 in RFC 7693 terms).
 	state[0] ^= uint64(hashSize) | (1 << 16) | (1 << 24)
 	var counter [2]uint64
 	if total := len(data); total > BlockSize {
 		// Compress every full block except the last one, which must be
 		// processed with the finalization flag set.
 		whole := total &^ (BlockSize - 1)
 		if whole == total {
 			whole -= BlockSize
 		}
 		hashBlocks(&state, &counter, 0, data[:whole])
 		data = data[whole:]
 	}
 	var last [BlockSize]byte
 	used := copy(last[:], data)
 	// hashBlocks advances the counter by a full block, so pre-subtract the
 	// zero padding (borrowing from the high word) to count only real input.
 	pad := uint64(BlockSize - used)
 	if counter[0] < pad {
 		counter[1]--
 	}
 	counter[0] -= pad
 	hashBlocks(&state, &counter, 0xFFFFFFFFFFFFFFFF, last[:])
 	for i, word := range state[:(hashSize+7)/8] {
 		binary.LittleEndian.PutUint64(sum[8*i:], word)
 	}
 }
 // digest is the streaming BLAKE2b state behind the hash.Hash values returned
 // by the New* constructors.
 type digest struct {
 	// h is the chaining value; Reset seeds it from iv and hashBlocks updates it.
 	h      [8]uint64
 	// c is the 128-bit byte counter: c[0] is the low word, c[1] the high word.
 	c      [2]uint64
 	// size is the digest length in bytes.
 	size   int
 	// block buffers input that has not been compressed yet.
 	block  [BlockSize]byte
 	// offset is the number of valid bytes currently held in block.
 	offset int
 	// key is the MAC key, zero-padded to a full block.
 	key    [BlockSize]byte
 	// keyLen is the key length in bytes; zero means unkeyed hashing.
 	keyLen int
 }
 // Serialized state layout used by MarshalBinary/UnmarshalBinary: the magic
 // prefix, the 8 state words, the 2 counter words, one byte for the digest
 // size, the buffered block, and one byte for the buffer offset.
 const (
 	magic         = "b2b"
 	marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1
 )
 // MarshalBinary serializes the running hash state: magic prefix, state words,
 // counter words (all big-endian), digest size, buffered block and buffer
 // offset. Keyed digests are rejected with an error.
 func (d *digest) MarshalBinary() ([]byte, error) {
 	if d.keyLen != 0 {
 		return nil, errors.New("crypto/blake2b: cannot marshal MACs")
 	}
 	out := append(make([]byte, 0, marshaledSize), magic...)
 	for _, word := range d.h {
 		out = appendUint64(out, word)
 	}
 	for _, word := range d.c {
 		out = appendUint64(out, word)
 	}
 	// size is at most 64 and offset at most BlockSize, so each fits in a byte.
 	out = append(out, byte(d.size))
 	out = append(out, d.block[:]...)
 	out = append(out, byte(d.offset))
 	return out, nil
 }
 // UnmarshalBinary restores a state produced by MarshalBinary. It returns an
 // error when the magic prefix or total length is wrong, or when the encoded
 // digest size or buffer offset is out of range. Rejecting those values here
 // keeps a corrupt state from panicking later in Write or Sum. The receiver is
 // only modified when decoding succeeds.
 func (d *digest) UnmarshalBinary(b []byte) error {
 	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
 		return errors.New("crypto/blake2b: invalid hash state identifier")
 	}
 	if len(b) != marshaledSize {
 		return errors.New("crypto/blake2b: invalid hash state size")
 	}
 	b = b[len(magic):]
 	// Decode into locals first so a rejected state leaves d untouched.
 	h, c := d.h, d.c
 	for i := range h {
 		b, h[i] = consumeUint64(b)
 	}
 	for i := range c {
 		b, c[i] = consumeUint64(b)
 	}
 	size := int(b[0])
 	block := b[1 : 1+BlockSize]
 	offset := int(b[1+BlockSize])
 	// The digest is cut from the 8 state words, so it can be at most
 	// 8*len(h) bytes (the package's Size) and newDigest never allows zero.
 	if size < 1 || size > 8*len(h) {
 		return errors.New("crypto/blake2b: invalid hash state digest size")
 	}
 	// offset indexes into the block buffer and may equal BlockSize when the
 	// buffer is exactly full.
 	if offset > BlockSize {
 		return errors.New("crypto/blake2b: invalid hash state block offset")
 	}
 	d.h, d.c, d.size, d.offset = h, c, size, offset
 	copy(d.block[:], block)
 	return nil
 }
 // BlockSize returns the package-level BlockSize constant (128 bytes).
 func (d *digest) BlockSize() int { return BlockSize }
 // Size returns the digest length in bytes that this digest was created with.
 func (d *digest) Size() int { return d.size }
 // Reset returns the digest to its initial state for the configured digest
 // size and key. For a keyed digest the padded key is loaded as a full pending
 // block, so it is compressed ahead of any written data.
 func (d *digest) Reset() {
 	// Parameter word: digest length, key length, plus bits 16 and 24.
 	params := uint64(d.size) | uint64(d.keyLen)<<8 | 1<<16 | 1<<24
 	d.h = iv
 	d.h[0] ^= params
 	d.c = [2]uint64{}
 	d.offset = 0
 	if d.keyLen > 0 {
 		d.block = d.key
 		d.offset = BlockSize
 	}
 }
 // Write absorbs p into the hash state. It always reports len(p) bytes written
 // and a nil error. The final (possibly full) block of input is kept buffered
 // rather than compressed, because the last block must be processed by
 // finalize.
 func (d *digest) Write(p []byte) (n int, err error) {
 	n = len(p)
 	if d.offset > 0 {
 		free := BlockSize - d.offset
 		if n <= free {
 			// Everything fits in the pending block; nothing to compress yet.
 			d.offset += copy(d.block[d.offset:], p)
 			return n, nil
 		}
 		// Top up the pending block and compress it.
 		copy(d.block[d.offset:], p[:free])
 		hashBlocks(&d.h, &d.c, 0, d.block[:])
 		d.offset = 0
 		p = p[free:]
 	}
 	if rest := len(p); rest > BlockSize {
 		// Compress whole blocks straight from p, holding back the last one.
 		whole := rest &^ (BlockSize - 1)
 		if whole == rest {
 			whole -= BlockSize
 		}
 		hashBlocks(&d.h, &d.c, 0, p[:whole])
 		p = p[whole:]
 	}
 	// Buffer the tail; copying an empty slice leaves offset unchanged.
 	d.offset += copy(d.block[:], p)
 	return n, nil
 }
 // Sum appends the current digest (d.size bytes) to sum and returns the
 // result. finalize works on copies, so the running state is left untouched.
 func (d *digest) Sum(sum []byte) []byte {
 	var full [Size]byte
 	d.finalize(&full)
 	out := full[:d.size]
 	return append(sum, out...)
 }
 // finalize compresses the buffered input as the last block and writes all
 // eight state words to hash in little-endian order. It operates on copies of
 // the state and counter, so d itself is not modified.
 func (d *digest) finalize(hash *[Size]byte) {
 	state, counter := d.h, d.c
 	var last [BlockSize]byte
 	copy(last[:], d.block[:d.offset])
 	// hashBlocks advances the counter by a full block, so pre-subtract the
 	// zero padding (borrowing from the high word) to count only real input.
 	pad := uint64(BlockSize - d.offset)
 	if counter[0] < pad {
 		counter[1]--
 	}
 	counter[0] -= pad
 	hashBlocks(&state, &counter, 0xFFFFFFFFFFFFFFFF, last[:])
 	for i := range state {
 		binary.LittleEndian.PutUint64(hash[8*i:], state[i])
 	}
 }
 // appendUint64 appends the 8-byte big-endian encoding of x to b and returns
 // the extended slice.
 func appendUint64(b []byte, x uint64) []byte {
 	start := len(b)
 	b = append(b, 0, 0, 0, 0, 0, 0, 0, 0)
 	binary.BigEndian.PutUint64(b[start:], x)
 	return b
 }
 // appendUint32 appends the 4-byte big-endian encoding of x to b and returns
 // the extended slice.
 func appendUint32(b []byte, x uint32) []byte {
 	start := len(b)
 	b = append(b, 0, 0, 0, 0)
 	binary.BigEndian.PutUint32(b[start:], x)
 	return b
 }
 // consumeUint64 decodes a big-endian uint64 from the first 8 bytes of b and
 // returns the remainder of b together with the decoded value. It panics when
 // b holds fewer than 8 bytes.
 func consumeUint64(b []byte) ([]byte, uint64) {
 	value := binary.BigEndian.Uint64(b)
 	rest := b[8:]
 	return rest, value
 }
 // consumeUint32 decodes a big-endian uint32 from the first 4 bytes of b and
 // returns the remainder of b together with the decoded value. It panics when
 // b holds fewer than 4 bytes.
 func consumeUint32(b []byte) ([]byte, uint32) {
 	value := binary.BigEndian.Uint32(b)
 	rest := b[4:]
 	return rest, value
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go
@ -0,0 +1,37 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.7,amd64,!gccgo,!appengine
 package blake2b
 import "golang.org/x/sys/cpu"
 // init records which of the SIMD extensions used by the assembly
 // implementations are reported by the cpu package.
 func init() {
 	useAVX2, useAVX, useSSE4 = cpu.X86.HasAVX2, cpu.X86.HasAVX, cpu.X86.HasSSE41
 }
 // hashBlocksAVX2, hashBlocksAVX and hashBlocksSSE4 have no Go body; they are
 // declared here and implemented in the package's amd64 assembly files. Each
 // compresses the 128-byte blocks in blocks into h and advances the counter c;
 // flag is mixed into the state and is all-ones for the final block.

 //go:noescape
 func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //go:noescape
 func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //go:noescape
 func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 // hashBlocks dispatches to the widest SIMD implementation the CPU supports,
 // preferring AVX2, then AVX, then SSE4, and falling back to the portable
 // Go implementation when none is available.
 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	if useAVX2 {
 		hashBlocksAVX2(h, c, flag, blocks)
 		return
 	}
 	if useAVX {
 		hashBlocksAVX(h, c, flag, blocks)
 		return
 	}
 	if useSSE4 {
 		hashBlocksSSE4(h, c, flag, blocks)
 		return
 	}
 	hashBlocksGeneric(h, c, flag, blocks)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
@ -0,0 +1,750 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.7,amd64,!gccgo,!appengine
 #include "textflag.h"
 // Read-only 256-bit constants for hashBlocksAVX2.
 // AVX2_iv0 / AVX2_iv1: the eight 64-bit IV words (same values as iv in
 // blake2b.go), four per vector.
 // AVX2_c40 / AVX2_c48: VPSHUFB byte-shuffle masks, repeated for both 128-bit
 // halves, that rotate every 64-bit lane right by 24 and by 16 bits.
 DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
 DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
 DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
 DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
 GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
 DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
 DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
 DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
 GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
 DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
 DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
 // Read-only 128-bit constants for hashBlocksAVX.
 // AVX_iv0..AVX_iv3: the eight 64-bit IV words (same values as iv in
 // blake2b.go), two per vector.
 // AVX_c40 / AVX_c48: VPSHUFB byte-shuffle masks that rotate every 64-bit lane
 // right by 24 and by 16 bits.
 DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
 DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
 GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
 DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
 GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
 DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
 GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
 DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
 GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
 // Hand-encoded VPERMQ $imm, Yn, Yn instructions, emitted as raw bytes. Each
 // name spells out the immediate and the register that is both source and
 // destination. ROUND_AVX2 uses them to rotate the 64-bit lanes of a state row.
 // NOTE(review): presumably byte-encoded because the assembler of the time
 // lacked the mnemonic — confirm before replacing with real instructions.
 #define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
 #define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
 #define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
 #define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
 #define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
 // ROUND_AVX2 performs one full round on the state rows held in Y0..Y3.
 //   m0, m1: message vectors for the first half (operating on columns)
 //   m2, m3: message vectors for the second half (operating on diagonals)
 //   t:      scratch register
 //   c40, c48: VPSHUFB masks rotating each 64-bit lane right by 24 / 16 bits
 // Within each half the four lane rotations are: VPSHUFD $-79 (swap the 32-bit
 // halves, i.e. rotate by 32), VPSHUFB c40, VPSHUFB c48, and the
 // VPADDQ/VPSRLQ $63/VPXOR triple (rotate left by 1). The VPERMQ steps between
 // the halves rotate rows Y1..Y3 so the diagonals line up as columns, and the
 // VPERMQ steps at the end undo that rotation.
 #define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
 	VPADDQ  m0, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFD $-79, Y3, Y3; \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPSHUFB c40, Y1, Y1;  \
 	VPADDQ  m1, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFB c48, Y3, Y3;  \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPADDQ  Y1, Y1, t;    \
 	VPSRLQ  $63, Y1, Y1;  \
 	VPXOR   t, Y1, Y1;    \
 	VPERMQ_0x39_Y1_Y1;    \
 	VPERMQ_0x4E_Y2_Y2;    \
 	VPERMQ_0x93_Y3_Y3;    \
 	VPADDQ  m2, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFD $-79, Y3, Y3; \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPSHUFB c40, Y1, Y1;  \
 	VPADDQ  m3, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFB c48, Y3, Y3;  \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPADDQ  Y1, Y1, t;    \
 	VPSRLQ  $63, Y1, Y1;  \
 	VPXOR   t, Y1, Y1;    \
 	VPERMQ_0x39_Y3_Y3;    \
 	VPERMQ_0x4E_Y2_Y2;    \
 	VPERMQ_0x93_Y1_Y1
 // Hand-encoded loads of message words, emitted as raw bytes.
 //   VMOVQ_SI_Xnn_0 / VMOVQ_SI_Xnn(n):         load the 64-bit word at 0(SI) /
 //                                             n(SI) into the low half of Xnn.
 //   VPINSRQ_1_SI_Xnn_0 / VPINSRQ_1_SI_Xnn(n): insert the 64-bit word at 0(SI) /
 //                                             n(SI) into the high half of Xnn.
 //   VMOVQ_R8_X15 / VPINSRQ_1_R9_X15:          move R8 into the low half and
 //                                             R9 into the high half of X15.
 // The (n) forms emit n as a single displacement byte, so n must fit in 8 bits;
 // the largest offset used in this file is 15*8.
 #define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
 #define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
 #define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
 #define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
 #define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
 #define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
 #define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
 #define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
 #define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
 #define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
 #define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
 #define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
 #define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
 #define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
 #define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
 #define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
 #define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
 #define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
 // Generic message loaders for the AVX2 path. Each gathers four 64-bit message
 // words, given as word indices into the block at SI, into one 256-bit
 // register: words i0,i1 go to the low 128 bits and i2,i3 are assembled in X11
 // and inserted as the high 128 bits. X11 is clobbered. The macros that need
 // word 0 use the dedicated *_0 encodings instead of these loaders.

 // load msg: Y12 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
 	VMOVQ_SI_X12(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X12(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y12, Y12
 // load msg: Y13 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
 	VMOVQ_SI_X13(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X13(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y13, Y13
 // load msg: Y14 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
 	VMOVQ_SI_X14(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X14(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y14, Y14
 // load msg: Y15 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
 	VMOVQ_SI_X15(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X15(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y15, Y15
 // Per-round message loaders for hashBlocksAVX2. The sixteen numbers in each
 // macro name are the message word indices in load order: the first four end
 // up in Y12, the next four in Y13, then Y14, then Y15, ready to be passed to
 // ROUND_AVX2 as m0..m3. X11 is used as scratch for the upper halves.
 // Groups that contain word 0 are expanded by hand with the *_0 encodings.
 // Where two wanted words are adjacent in memory they are loaded together with
 // VMOVDQU, or with VPSHUFD $0x4E when they are needed in swapped order.
 #define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
 	VMOVQ_SI_X12_0;                   \
 	VMOVQ_SI_X11(4*8);                \
 	VPINSRQ_1_SI_X12(2*8);            \
 	VPINSRQ_1_SI_X11(6*8);            \
 	VINSERTI128 $1, X11, Y12, Y12;    \
 	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
 	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
 	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
 #define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
 	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
 	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
 	VMOVQ_SI_X11(11*8);              \
 	VPSHUFD     $0x4E, 0*8(SI), X14; \
 	VPINSRQ_1_SI_X11(5*8);           \
 	VINSERTI128 $1, X11, Y14, Y14;   \
 	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
 #define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
 	VMOVQ_SI_X11(5*8);              \
 	VMOVDQU     11*8(SI), X12;      \
 	VPINSRQ_1_SI_X11(15*8);         \
 	VINSERTI128 $1, X11, Y12, Y12;  \
 	VMOVQ_SI_X13(8*8);              \
 	VMOVQ_SI_X11(2*8);              \
 	VPINSRQ_1_SI_X13_0;             \
 	VPINSRQ_1_SI_X11(13*8);         \
 	VINSERTI128 $1, X11, Y13, Y13;  \
 	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
 	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
 #define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
 	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
 	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
 	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
 	VMOVQ_SI_X15(6*8);               \
 	VMOVQ_SI_X11_0;                  \
 	VPINSRQ_1_SI_X15(10*8);          \
 	VPINSRQ_1_SI_X11(8*8);           \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
 	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
 	VMOVQ_SI_X13_0;                  \
 	VMOVQ_SI_X11(4*8);               \
 	VPINSRQ_1_SI_X13(7*8);           \
 	VPINSRQ_1_SI_X11(15*8);          \
 	VINSERTI128 $1, X11, Y13, Y13;   \
 	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
 	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
 #define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
 	VMOVQ_SI_X12(2*8);                \
 	VMOVQ_SI_X11_0;                   \
 	VPINSRQ_1_SI_X12(6*8);            \
 	VPINSRQ_1_SI_X11(8*8);            \
 	VINSERTI128 $1, X11, Y12, Y12;    \
 	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
 	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
 	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
 #define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
 	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
 	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
 	VMOVQ_SI_X14_0;                   \
 	VPSHUFD     $0x4E, 8*8(SI), X11;  \
 	VPINSRQ_1_SI_X14(6*8);            \
 	VINSERTI128 $1, X11, Y14, Y14;    \
 	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
 #define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
 	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
 	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
 	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
 	VMOVQ_SI_X15_0;                  \
 	VMOVQ_SI_X11(6*8);               \
 	VPINSRQ_1_SI_X15(4*8);           \
 	VPINSRQ_1_SI_X11(10*8);          \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
 	VMOVQ_SI_X12(6*8);              \
 	VMOVQ_SI_X11(11*8);             \
 	VPINSRQ_1_SI_X12(14*8);         \
 	VPINSRQ_1_SI_X11_0;             \
 	VINSERTI128 $1, X11, Y12, Y12;  \
 	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
 	VMOVQ_SI_X11(1*8);              \
 	VMOVDQU     12*8(SI), X14;      \
 	VPINSRQ_1_SI_X11(10*8);         \
 	VINSERTI128 $1, X11, Y14, Y14;  \
 	VMOVQ_SI_X15(2*8);              \
 	VMOVDQU     4*8(SI), X11;       \
 	VPINSRQ_1_SI_X15(7*8);          \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
 	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
 	VMOVQ_SI_X13(2*8);               \
 	VPSHUFD     $0x4E, 5*8(SI), X11; \
 	VPINSRQ_1_SI_X13(4*8);           \
 	VINSERTI128 $1, X11, Y13, Y13;   \
 	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
 	VMOVQ_SI_X15(11*8);              \
 	VMOVQ_SI_X11(12*8);              \
 	VPINSRQ_1_SI_X15(14*8);          \
 	VPINSRQ_1_SI_X11_0;              \
 	VINSERTI128 $1, X11, Y15, Y15
 // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //
 // Compresses every 128-byte block of blocks into the chaining value h using
 // 256-bit registers and advances the 128-bit byte counter c by 128 per block.
 // flag is mixed into the last state row; the Go callers pass all-ones for the
 // final block and zero otherwise. The loop exits only when the remaining
 // length reaches exactly zero, so blocks_len must be a non-zero multiple of
 // 128 (the Go callers in blake2b.go only pass such lengths).
 //
 // Register use: AX = h, BX = c, SI = current block, DI = bytes left,
 // R8/R9 = counter low/high, DX = caller's SP, Y8/Y9 = h, Y6/Y7 = IV,
 // Y4/Y5 = c40/c48 shuffle masks, Y0..Y3 = working state rows,
 // Y12..Y15 = message vectors, Y10 = scratch.
 // Stack after alignment: 0(SP) counter low, 8(SP) counter high, 16(SP) flag,
 // 24(SP) zero; 32..159(SP) and 160..287(SP) keep the message vectors of the
 // first two rounds so they can be replayed for the last two rounds.
 // The TEXT flag value 4 is NOSPLIT in textflag.h.
 TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
 	MOVQ h+0(FP), AX
 	MOVQ c+8(FP), BX
 	MOVQ flag+16(FP), CX
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 	// Save the caller's SP in DX and round SP up to a 32-byte boundary so the
 	// aligned VMOVDQA stores below are legal.
 	MOVQ SP, DX
 	MOVQ SP, R9
 	ADDQ $31, R9
 	ANDQ $~31, R9
 	MOVQ R9, SP
 	// 16(SP) = flag, 24(SP) = 0; the counter words at 0(SP)/8(SP) are filled
 	// in per block inside the loop.
 	MOVQ CX, 16(SP)
 	XORQ CX, CX
 	MOVQ CX, 24(SP)
 	VMOVDQU ·AVX2_c40<>(SB), Y4
 	VMOVDQU ·AVX2_c48<>(SB), Y5
 	VMOVDQU 0(AX), Y8
 	VMOVDQU 32(AX), Y9
 	VMOVDQU ·AVX2_iv0<>(SB), Y6
 	VMOVDQU ·AVX2_iv1<>(SB), Y7
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
 	MOVQ R9, 8(SP)
 loop:
 	// counter += 128; a result below 128 means the low word wrapped, so carry
 	// into the high word.
 	ADDQ $128, R8
 	MOVQ R8, 0(SP)
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
 	MOVQ R9, 8(SP)
 noinc:
 	// Working state: rows 0/1 = h, row 2 = first IV half, row 3 = second IV
 	// half XOR (counter low, counter high, flag, 0).
 	VMOVDQA Y8, Y0
 	VMOVDQA Y9, Y1
 	VMOVDQA Y6, Y2
 	VPXOR   0(SP), Y7, Y3
 	// Rounds 1 and 2: also spill their message vectors for reuse below.
 	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
 	VMOVDQA Y12, 32(SP)
 	VMOVDQA Y13, 64(SP)
 	VMOVDQA Y14, 96(SP)
 	VMOVDQA Y15, 128(SP)
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
 	VMOVDQA Y12, 160(SP)
 	VMOVDQA Y13, 192(SP)
 	VMOVDQA Y14, 224(SP)
 	VMOVDQA Y15, 256(SP)
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	// Rounds 3 to 10.
 	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	// Rounds 11 and 12 replay the message order of rounds 1 and 2 from the
 	// stack copies.
 	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
 	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)
 	// Feed-forward: h ^= rows 0/1 ^ rows 2/3.
 	VPXOR Y0, Y8, Y8
 	VPXOR Y1, Y9, Y9
 	VPXOR Y2, Y8, Y8
 	VPXOR Y3, Y9, Y9
 	LEAQ 128(SI), SI
 	SUBQ $128, DI
 	JNE  loop
 	// Write back the counter and chaining value, then restore the caller's SP.
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 	VMOVDQU Y8, 0(AX)
 	VMOVDQU Y9, 32(AX)
 	VZEROUPPER
 	MOVQ DX, SP
 	RET
 // Hand-encoded VPUNPCKLQDQ / VPUNPCKHQDQ instructions, emitted as raw bytes.
 // The three registers in each name are the operands in Go assembler order:
 // the two sources first, the destination last. SHUFFLE_AVX and
 // SHUFFLE_AVX_INV combine them, with X15 as a temporary, to move 64-bit words
 // between the 128-bit registers holding the state rows.
 #define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
 #define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
 #define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
 #define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
 #define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
 #define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
 #define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
 #define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
 #define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
 #define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
 #define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
 #define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
 #define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
 // Round helpers for hashBlocksAVX, which keeps the 16-word working state in
 // X0..X7 (two 64-bit words per register).
 //
 // SHUFFLE_AVX() rearranges the words held in X2..X7 between the two half
 // rounds of a round (X0 and X1 are left alone), and SHUFFLE_AVX_INV() applies
 // the opposite rearrangement afterwards. Both clobber X13, X14 and X15.
 //
 // HALF_ROUND_AVX(v0..v7, m0..m3, t0, c40, c48) runs the mixing step on two
 // register groups in parallel, (v0, v2, v4, v6) and (v1, v3, v5, v7):
 //   m0, m1: message words added in the first part
 //   m2, m3: message words added in the second part
 //   t0:     scratch register; it may alias m3 because m3 is consumed before
 //           t0 is first written
 //   c40, c48: VPSHUFB masks rotating each 64-bit lane right by 24 / 16 bits
 // The remaining rotations are VPSHUFD $-79 (swap the 32-bit halves, i.e.
 // rotate by 32) and the VPADDQ/VPSRLQ $63/VPXOR triple (rotate left by 1).
 #define SHUFFLE_AVX() \
 	VMOVDQA X6, X13;         \
 	VMOVDQA X2, X14;         \
 	VMOVDQA X4, X6;          \
 	VPUNPCKLQDQ_X13_X13_X15; \
 	VMOVDQA X5, X4;          \
 	VMOVDQA X6, X5;          \
 	VPUNPCKHQDQ_X15_X7_X6;   \
 	VPUNPCKLQDQ_X7_X7_X15;   \
 	VPUNPCKHQDQ_X15_X13_X7;  \
 	VPUNPCKLQDQ_X3_X3_X15;   \
 	VPUNPCKHQDQ_X15_X2_X2;   \
 	VPUNPCKLQDQ_X14_X14_X15; \
 	VPUNPCKHQDQ_X15_X3_X3;   \
 #define SHUFFLE_AVX_INV() \
 	VMOVDQA X2, X13;         \
 	VMOVDQA X4, X14;         \
 	VPUNPCKLQDQ_X2_X2_X15;   \
 	VMOVDQA X5, X4;          \
 	VPUNPCKHQDQ_X15_X3_X2;   \
 	VMOVDQA X14, X5;         \
 	VPUNPCKLQDQ_X3_X3_X15;   \
 	VMOVDQA X6, X14;         \
 	VPUNPCKHQDQ_X15_X13_X3;  \
 	VPUNPCKLQDQ_X7_X7_X15;   \
 	VPUNPCKHQDQ_X15_X6_X6;   \
 	VPUNPCKLQDQ_X14_X14_X15; \
 	VPUNPCKHQDQ_X15_X7_X7;   \
 #define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
 	VPADDQ  m0, v0, v0;   \
 	VPADDQ  v2, v0, v0;   \
 	VPADDQ  m1, v1, v1;   \
 	VPADDQ  v3, v1, v1;   \
 	VPXOR   v0, v6, v6;   \
 	VPXOR   v1, v7, v7;   \
 	VPSHUFD $-79, v6, v6; \
 	VPSHUFD $-79, v7, v7; \
 	VPADDQ  v6, v4, v4;   \
 	VPADDQ  v7, v5, v5;   \
 	VPXOR   v4, v2, v2;   \
 	VPXOR   v5, v3, v3;   \
 	VPSHUFB c40, v2, v2;  \
 	VPSHUFB c40, v3, v3;  \
 	VPADDQ  m2, v0, v0;   \
 	VPADDQ  v2, v0, v0;   \
 	VPADDQ  m3, v1, v1;   \
 	VPADDQ  v3, v1, v1;   \
 	VPXOR   v0, v6, v6;   \
 	VPXOR   v1, v7, v7;   \
 	VPSHUFB c48, v6, v6;  \
 	VPSHUFB c48, v7, v7;  \
 	VPADDQ  v6, v4, v4;   \
 	VPADDQ  v7, v5, v5;   \
 	VPXOR   v4, v2, v2;   \
 	VPXOR   v5, v3, v3;   \
 	VPADDQ  v2, v2, t0;   \
 	VPSRLQ  $63, v2, v2;  \
 	VPXOR   t0, v2, v2;   \
 	VPADDQ  v3, v3, t0;   \
 	VPSRLQ  $63, v3, v3;  \
 	VPXOR   t0, v3, v3
 // Message loaders for hashBlocksAVX. Every macro fills X12..X15 with eight
 // 64-bit message words, read from the block at SI, for one HALF_ROUND_AVX
 // call; each pair listed below is (low word, high word) given as word
 // indices. LOAD_MSG_AVX is the generic form; the specialised macros cover the
 // orders that contain word 0 or that can load two adjacent words with one
 // VMOVDQU / VPSHUFD $0x4E (the latter swaps the two loaded words).

 // load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
 // i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
 #define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
 	VMOVQ_SI_X12(i0*8);     \
 	VMOVQ_SI_X13(i2*8);     \
 	VMOVQ_SI_X14(i4*8);     \
 	VMOVQ_SI_X15(i6*8);     \
 	VPINSRQ_1_SI_X12(i1*8); \
 	VPINSRQ_1_SI_X13(i3*8); \
 	VPINSRQ_1_SI_X14(i5*8); \
 	VPINSRQ_1_SI_X15(i7*8)
 // load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
 #define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
 	VMOVQ_SI_X12_0;        \
 	VMOVQ_SI_X13(4*8);     \
 	VMOVQ_SI_X14(1*8);     \
 	VMOVQ_SI_X15(5*8);     \
 	VPINSRQ_1_SI_X12(2*8); \
 	VPINSRQ_1_SI_X13(6*8); \
 	VPINSRQ_1_SI_X14(3*8); \
 	VPINSRQ_1_SI_X15(7*8)
 // load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
 #define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
 	VPSHUFD $0x4E, 0*8(SI), X12; \
 	VMOVQ_SI_X13(11*8);          \
 	VMOVQ_SI_X14(12*8);          \
 	VMOVQ_SI_X15(7*8);           \
 	VPINSRQ_1_SI_X13(5*8);       \
 	VPINSRQ_1_SI_X14(2*8);       \
 	VPINSRQ_1_SI_X15(3*8)
 // load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
 #define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
 	VMOVDQU 11*8(SI), X12;  \
 	VMOVQ_SI_X13(5*8);      \
 	VMOVQ_SI_X14(8*8);      \
 	VMOVQ_SI_X15(2*8);      \
 	VPINSRQ_1_SI_X13(15*8); \
 	VPINSRQ_1_SI_X14_0;     \
 	VPINSRQ_1_SI_X15(13*8)
 // load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
 #define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
 	VMOVQ_SI_X12(2*8);      \
 	VMOVQ_SI_X13(4*8);      \
 	VMOVQ_SI_X14(6*8);      \
 	VMOVQ_SI_X15_0;         \
 	VPINSRQ_1_SI_X12(5*8);  \
 	VPINSRQ_1_SI_X13(15*8); \
 	VPINSRQ_1_SI_X14(10*8); \
 	VPINSRQ_1_SI_X15(8*8)
 // load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
 #define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
 	VMOVQ_SI_X12(9*8);      \
 	VMOVQ_SI_X13(2*8);      \
 	VMOVQ_SI_X14_0;         \
 	VMOVQ_SI_X15(4*8);      \
 	VPINSRQ_1_SI_X12(5*8);  \
 	VPINSRQ_1_SI_X13(10*8); \
 	VPINSRQ_1_SI_X14(7*8);  \
 	VPINSRQ_1_SI_X15(15*8)
 // load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
 #define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
 	VMOVQ_SI_X12(2*8);      \
 	VMOVQ_SI_X13_0;         \
 	VMOVQ_SI_X14(12*8);     \
 	VMOVQ_SI_X15(11*8);     \
 	VPINSRQ_1_SI_X12(6*8);  \
 	VPINSRQ_1_SI_X13(8*8);  \
 	VPINSRQ_1_SI_X14(10*8); \
 	VPINSRQ_1_SI_X15(3*8)
 // load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
 #define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
 	MOVQ    0*8(SI), X12;        \
 	VPSHUFD $0x4E, 8*8(SI), X13; \
 	MOVQ    7*8(SI), X14;        \
 	MOVQ    2*8(SI), X15;        \
 	VPINSRQ_1_SI_X12(6*8);       \
 	VPINSRQ_1_SI_X14(3*8);       \
 	VPINSRQ_1_SI_X15(11*8)
 // load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
 #define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
 	MOVQ 6*8(SI), X12;      \
 	MOVQ 11*8(SI), X13;     \
 	MOVQ 15*8(SI), X14;     \
 	MOVQ 3*8(SI), X15;      \
 	VPINSRQ_1_SI_X12(14*8); \
 	VPINSRQ_1_SI_X13_0;     \
 	VPINSRQ_1_SI_X14(9*8);  \
 	VPINSRQ_1_SI_X15(8*8)
 // load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
 #define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
 	MOVQ 5*8(SI), X12;      \
 	MOVQ 8*8(SI), X13;      \
 	MOVQ 0*8(SI), X14;      \
 	MOVQ 6*8(SI), X15;      \
 	VPINSRQ_1_SI_X12(15*8); \
 	VPINSRQ_1_SI_X13(2*8);  \
 	VPINSRQ_1_SI_X14(4*8);  \
 	VPINSRQ_1_SI_X15(10*8)
 // load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
 #define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
 	VMOVDQU 12*8(SI), X12;  \
 	MOVQ    1*8(SI), X13;   \
 	MOVQ    2*8(SI), X14;   \
 	VPINSRQ_1_SI_X13(10*8); \
 	VPINSRQ_1_SI_X14(7*8);  \
 	VMOVDQU 4*8(SI), X15
 // load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
 #define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
 	MOVQ 15*8(SI), X12;     \
 	MOVQ 3*8(SI), X13;      \
 	MOVQ 11*8(SI), X14;     \
 	MOVQ 12*8(SI), X15;     \
 	VPINSRQ_1_SI_X12(9*8);  \
 	VPINSRQ_1_SI_X13(13*8); \
 	VPINSRQ_1_SI_X14(14*8); \
 	VPINSRQ_1_SI_X15_0
 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //
 // Compresses every 128-byte block of blocks into the chaining value h using
 // 128-bit registers and advances the 128-bit byte counter c by 128 per block.
 // flag is mixed into the last state row; the Go callers pass all-ones for the
 // final block and zero otherwise. The loop exits only when the remaining
 // length reaches exactly zero, so blocks_len must be a non-zero multiple of
 // 128 (the Go callers in blake2b.go only pass such lengths).
 //
 // Register use: AX = h, BX = c, SI = current block, DI = bytes left,
 // R8/R9 = counter low/high, BP = caller's SP, X8/X9 = c40/c48 shuffle masks,
 // X10/X11 = h[0..3], X0..X7 = working state, X12..X15 = message words
 // (X15 doubles as the scratch register of HALF_ROUND_AVX).
 // Stack after alignment: 0(SP) = last IV pair XOR (flag, 0); 16..271(SP) keep
 // the message vectors of the first four half rounds so they can be replayed
 // for the last four.
 // The TEXT flag value 4 is NOSPLIT in textflag.h.
 // NOTE(review): BP is used as a general-purpose register to hold the original
 // SP for the whole function — confirm this is acceptable for frame-pointer
 // based stack walking on the Go versions this file targets.
 TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	MOVQ h+0(FP), AX
 	MOVQ c+8(FP), BX
 	MOVQ flag+16(FP), CX
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 	// Save the caller's SP in BP and round SP up to a 16-byte boundary so the
 	// aligned VMOVDQA accesses below are legal.
 	MOVQ SP, BP
 	MOVQ SP, R9
 	ADDQ $15, R9
 	ANDQ $~15, R9
 	MOVQ R9, SP
 	VMOVDQU ·AVX_c40<>(SB), X0
 	VMOVDQU ·AVX_c48<>(SB), X1
 	VMOVDQA X0, X8
 	VMOVDQA X1, X9
 	VMOVDQU ·AVX_iv3<>(SB), X0
 	VMOVDQA X0, 0(SP)
 	XORQ    CX, 0(SP)          // 0(SP) = ·AVX_iv3 ^ (CX || 0)
 	VMOVDQU 0(AX), X10
 	VMOVDQU 16(AX), X11
 	VMOVDQU 32(AX), X2
 	VMOVDQU 48(AX), X3
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
 loop:
 	// counter += 128; a result below 128 means the low word wrapped, so carry
 	// into the high word.
 	ADDQ $128, R8
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
 noinc:
 	// X15 = (counter low, counter high).
 	VMOVQ_R8_X15
 	VPINSRQ_1_R9_X15
 	// Working state: X0..X3 = h (X2/X3 already hold h[4..7]), X4/X5 = first
 	// two IV pairs, X6 = third IV pair XOR counter, X7 = last IV pair XOR flag.
 	VMOVDQA X10, X0
 	VMOVDQA X11, X1
 	VMOVDQU ·AVX_iv0<>(SB), X4
 	VMOVDQU ·AVX_iv1<>(SB), X5
 	VMOVDQU ·AVX_iv2<>(SB), X6
 	VPXOR   X15, X6, X6
 	VMOVDQA 0(SP), X7
 	// Rounds 1 and 2 (four half rounds): also spill their message vectors for
 	// reuse at the end.
 	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
 	VMOVDQA X12, 16(SP)
 	VMOVDQA X13, 32(SP)
 	VMOVDQA X14, 48(SP)
 	VMOVDQA X15, 64(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
 	VMOVDQA X12, 80(SP)
 	VMOVDQA X13, 96(SP)
 	VMOVDQA X14, 112(SP)
 	VMOVDQA X15, 128(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
 	VMOVDQA X12, 144(SP)
 	VMOVDQA X13, 160(SP)
 	VMOVDQA X14, 176(SP)
 	VMOVDQA X15, 192(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
 	VMOVDQA X12, 208(SP)
 	VMOVDQA X13, 224(SP)
 	VMOVDQA X14, 240(SP)
 	VMOVDQA X15, 256(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	// Rounds 3 to 10.
 	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	// Rounds 11 and 12 replay the message order of rounds 1 and 2 from the
 	// stack copies.
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
 	SHUFFLE_AVX()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
 	SHUFFLE_AVX()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	// Feed-forward. h[0..3] stays in X10/X11 across iterations; h[4..7] is
 	// reloaded from memory, updated, stored back, and left in X2/X3 as the
 	// starting state for the next block.
 	VMOVDQU 32(AX), X14
 	VMOVDQU 48(AX), X15
 	VPXOR   X0, X10, X10
 	VPXOR   X1, X11, X11
 	VPXOR   X2, X14, X14
 	VPXOR   X3, X15, X15
 	VPXOR   X4, X10, X10
 	VPXOR   X5, X11, X11
 	VPXOR   X6, X14, X2
 	VPXOR   X7, X15, X3
 	VMOVDQU X2, 32(AX)
 	VMOVDQU X3, 48(AX)
 	LEAQ 128(SI), SI
 	SUBQ $128, DI
 	JNE  loop
 	// Write back h[0..3] and the counter, then restore the caller's SP.
 	VMOVDQU X10, 0(AX)
 	VMOVDQU X11, 16(AX)
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 	VZEROUPPER
 	MOVQ BP, SP
 	RET
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go
@ -0,0 +1,24 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !go1.7,amd64,!gccgo,!appengine
 package blake2b
 import "golang.org/x/sys/cpu"
 // init selects the SSE4.1 implementation of hashBlocks when the CPU
 // feature flags reported by the cpu package say it is available.
 func init() {
 	useSSE4 = cpu.X86.HasSSE41
 }
 // hashBlocksSSE4 is the assembly counterpart of hashBlocksGeneric; it has no
 // Go body and takes the same arguments. The noescape directive below tells
 // the compiler that the pointer arguments do not escape through this call.
 //go:noescape
 func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 // hashBlocks forwards the compression of blocks into h (with counter c and
 // finalization flag) to the SSE4 assembly when useSSE4 is set, and to the
 // portable Go implementation otherwise.
 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	if !useSSE4 {
 		hashBlocksGeneric(h, c, flag, blocks)
 		return
 	}
 	hashBlocksSSE4(h, c, flag, blocks)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
@ -0,0 +1,281 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64,!gccgo,!appengine
 #include "textflag.h"
 // Initialization vector words loaded into the second half of the working
 // state by hashBlocksSSE4; each 16-byte constant packs two 64-bit words
 // (iv0 = words 0-1, iv1 = words 2-3, iv2 = words 4-5, iv3 = words 6-7).
 DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
 DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
 GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
 DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
 DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
 GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
 DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
 DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
 GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
 DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
 DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
 GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
 // PSHUFB byte-selection masks used by HALF_ROUND. Within each 64-bit lane,
 // c40 moves source byte 3 to byte 0 (a rotate left by 40 bits, i.e. right by
 // 24) and c48 moves source byte 2 to byte 0 (rotate left by 48, right by 16).
 DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
 DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
 // SHUFFLE re-arranges the working state between the two HALF_ROUNDs of a
 // round so that the second HALF_ROUND mixes the diagonals. Treating each
 // register pair as a row of four 64-bit lanes, it rotates (v2,v3) left by one
 // lane, swaps v4 and v5 (a rotation by two lanes) and rotates (v6,v7) left by
 // three lanes. t1 and t2 are scratch registers and are clobbered.
 #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
 	MOVO       v4, t1; \
 	MOVO       v5, v4; \
 	MOVO       t1, v5; \
 	MOVO       v6, t1; \
 	PUNPCKLQDQ v6, t2; \
 	PUNPCKHQDQ v7, v6; \
 	PUNPCKHQDQ t2, v6; \
 	PUNPCKLQDQ v7, t2; \
 	MOVO       t1, v7; \
 	MOVO       v2, t1; \
 	PUNPCKHQDQ t2, v7; \
 	PUNPCKLQDQ v3, t2; \
 	PUNPCKHQDQ t2, v2; \
 	PUNPCKLQDQ t1, t2; \
 	PUNPCKHQDQ t2, v3
 // SHUFFLE_INV undoes SHUFFLE: it applies the same instruction sequence with
 // the roles of (v2,v3) and (v6,v7) exchanged, so (v2,v3) is rotated left by
 // three lanes, v4 and v5 are swapped back, and (v6,v7) is rotated left by one
 // lane. t1 and t2 are scratch registers and are clobbered.
 #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
 	MOVO       v4, t1; \
 	MOVO       v5, v4; \
 	MOVO       t1, v5; \
 	MOVO       v2, t1; \
 	PUNPCKLQDQ v2, t2; \
 	PUNPCKHQDQ v3, v2; \
 	PUNPCKHQDQ t2, v2; \
 	PUNPCKLQDQ v3, t2; \
 	MOVO       t1, v3; \
 	MOVO       v6, t1; \
 	PUNPCKHQDQ t2, v3; \
 	PUNPCKLQDQ v7, t2; \
 	PUNPCKHQDQ t2, v6; \
 	PUNPCKLQDQ t1, t2; \
 	PUNPCKHQDQ t2, v7
 // HALF_ROUND applies the mixing step to four 64-bit lanes at once. With
 // a = (v0,v1), b = (v2,v3), c = (v4,v5), d = (v6,v7) it computes, lane-wise:
 //   a += m0/m1 + b;  d = rotr(d ^ a, 32);  c += d;  b = rotr(b ^ c, 24)
 //   a += m2/m3 + b;  d = rotr(d ^ a, 16);  c += d;  b = rotr(b ^ c, 63)
 // The rotate by 32 is a PSHUFD dword swap, the rotates by 24 and 16 are
 // PSHUFB with the c40 and c48 masks, and the rotate by 63 is built from
 // (b + b) ^ (b >> 63) using t0 as scratch. m0..m3 may be registers or memory.
 #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
 	PADDQ  m0, v0;        \
 	PADDQ  m1, v1;        \
 	PADDQ  v2, v0;        \
 	PADDQ  v3, v1;        \
 	PXOR   v0, v6;        \
 	PXOR   v1, v7;        \
 	PSHUFD $0xB1, v6, v6; \
 	PSHUFD $0xB1, v7, v7; \
 	PADDQ  v6, v4;        \
 	PADDQ  v7, v5;        \
 	PXOR   v4, v2;        \
 	PXOR   v5, v3;        \
 	PSHUFB c40, v2;       \
 	PSHUFB c40, v3;       \
 	PADDQ  m2, v0;        \
 	PADDQ  m3, v1;        \
 	PADDQ  v2, v0;        \
 	PADDQ  v3, v1;        \
 	PXOR   v0, v6;        \
 	PXOR   v1, v7;        \
 	PSHUFB c48, v6;       \
 	PSHUFB c48, v7;       \
 	PADDQ  v6, v4;        \
 	PADDQ  v7, v5;        \
 	PXOR   v4, v2;        \
 	PXOR   v5, v3;        \
 	MOVOU  v2, t0;        \
 	PADDQ  v2, t0;        \
 	PSRLQ  $63, v2;       \
 	PXOR   t0, v2;        \
 	MOVOU  v3, t0;        \
 	PADDQ  v3, t0;        \
 	PSRLQ  $63, v3;       \
 	PXOR   t0, v3
 // LOAD_MSG gathers eight 64-bit message words from the block at src into
 // four registers: m0 = (word i0, word i1), m1 = (word i2, word i3),
 // m2 = (word i4, word i5), m3 = (word i6, word i7). The indices are word
 // indices; each is scaled by 8 to form the byte offset.
 #define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
 	MOVQ   i0*8(src), m0;     \
 	PINSRQ $1, i1*8(src), m0; \
 	MOVQ   i2*8(src), m1;     \
 	PINSRQ $1, i3*8(src), m1; \
 	MOVQ   i4*8(src), m2;     \
 	PINSRQ $1, i5*8(src), m2; \
 	MOVQ   i6*8(src), m3;     \
 	PINSRQ $1, i7*8(src), m3
 // hashBlocksSSE4 compresses each 128-byte block of blocks into the chaining
 // value h and writes the advanced 128-bit byte counter back to c. flag is
 // XORed into the low word of iv3 before it enters the working state.
 //
 // Register use: AX = h, BX = c, SI = current block, DI = bytes left,
 // R8/R9 = low/high counter words, X12/X15 = h[0..3] carried across blocks,
 // X13/X14 = c40/c48 shuffle masks, X0-X7 = working state, X8-X11 = message
 // words and scratch.
 //
 // The loop body runs before DI is tested and only stops when DI reaches
 // exactly zero, so blocks_len must be a non-zero multiple of 128.
 // NOTE(review): that precondition is not checked here; confirm callers.
 //
 // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	MOVQ h+0(FP), AX
 	MOVQ c+8(FP), BX
 	MOVQ flag+16(FP), CX
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 	// Save the original SP in BP and round SP up to a 16-byte boundary so the
 	// aligned MOVO stores to the frame below are valid.
 	MOVQ SP, BP
 	MOVQ SP, R9
 	ADDQ $15, R9
 	ANDQ $~15, R9
 	MOVQ R9, SP
 	MOVOU ·iv3<>(SB), X0
 	MOVO  X0, 0(SP)
 	XORQ  CX, 0(SP)     // 0(SP) = ·iv3 ^ (CX || 0)
 	MOVOU ·c40<>(SB), X13
 	MOVOU ·c48<>(SB), X14
 	// h[0..3] stay in X12/X15 for the whole call; h[4..7] are re-read from
 	// memory for every block.
 	MOVOU 0(AX), X12
 	MOVOU 16(AX), X15
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
 loop:
 	// Advance the byte counter by one block; a low word below 128 after the
 	// addition means it wrapped, so carry into the high word.
 	ADDQ $128, R8
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
 noinc:
 	// Build the working state: X0-X3 = h, X4-X5 = iv0/iv1,
 	// X6 = iv2 ^ counter, X7 = iv3 ^ (flag || 0).
 	MOVQ R8, X8
 	PINSRQ $1, R9, X8
 	MOVO X12, X0
 	MOVO X15, X1
 	MOVOU 32(AX), X2
 	MOVOU 48(AX), X3
 	MOVOU ·iv0<>(SB), X4
 	MOVOU ·iv1<>(SB), X5
 	MOVOU ·iv2<>(SB), X6
 	PXOR X8, X6
 	MOVO 0(SP), X7
 	// Rounds 1 and 2: the gathered message words are also saved at
 	// 16(SP)..256(SP) so the last two rounds can reuse them without reloading.
 	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
 	MOVO X8, 16(SP)
 	MOVO X9, 32(SP)
 	MOVO X10, 48(SP)
 	MOVO X11, 64(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
 	MOVO X8, 80(SP)
 	MOVO X9, 96(SP)
 	MOVO X10, 112(SP)
 	MOVO X11, 128(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
 	MOVO X8, 144(SP)
 	MOVO X9, 160(SP)
 	MOVO X10, 176(SP)
 	MOVO X11, 192(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
 	MOVO X8, 208(SP)
 	MOVO X9, 224(SP)
 	MOVO X10, 240(SP)
 	MOVO X11, 256(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	// Rounds 3 to 10: message words are gathered straight from the block.
 	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	// Rounds 11 and 12 reuse the message words saved on the stack during
 	// rounds 1 and 2.
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	// Fold both halves of the working state into the chaining value:
 	// h[0..3] accumulate in X12/X15, h[4..7] are updated in memory.
 	MOVOU 32(AX), X10
 	MOVOU 48(AX), X11
 	PXOR  X0, X12
 	PXOR  X1, X15
 	PXOR  X2, X10
 	PXOR  X3, X11
 	PXOR  X4, X12
 	PXOR  X5, X15
 	PXOR  X6, X10
 	PXOR  X7, X11
 	MOVOU X10, 32(AX)
 	MOVOU X11, 48(AX)
 	// Step to the next block and loop until no bytes remain.
 	LEAQ 128(SI), SI
 	SUBQ $128, DI
 	JNE  loop
 	// Write back h[0..3] and the updated counter, then restore the caller's SP.
 	MOVOU X12, 0(AX)
 	MOVOU X15, 16(AX)
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 	MOVQ BP, SP
 	RET
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go
@ -0,0 +1,182 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package blake2b
 import (
 	"encoding/binary"
 	"math/bits"
 )
 // precomputed is the message-word schedule used by hashBlocksGeneric.
 // There are 12 16-byte arrays - one for each round. Row j lists the indices
 // into the 16-word message block in the order round j consumes them
 // (s[0]..s[15] in hashBlocksGeneric); the entries are calculated from the
 // sigma constants. The last two rows repeat the first two.
 var precomputed = [12][16]byte{
 	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
 	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
 	{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
 	{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
 	{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
 	{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
 	{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
 	{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
 	{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
 	{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
 	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, // equal to the first
 	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, // equal to the second
 }
 // hashBlocksGeneric is the portable compression function. It consumes blocks
 // in BlockSize-byte steps, folding each one into the chaining value h, and
 // stores the advanced 128-bit byte counter back into c when it is done.
 // flag is XORed into state word 14 of every block processed by the call.
 func hashBlocksGeneric(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	var (
 		msg [16]uint64 // little-endian message words of the current block
 		v   [16]uint64 // working state
 	)
 	lo, hi := c[0], c[1]
 	// g mixes message words x and y into state words a, b, cc and d.
 	g := func(a, b, cc, d int, x, y uint64) {
 		v[a] += x + v[b]
 		v[d] = bits.RotateLeft64(v[d]^v[a], -32)
 		v[cc] += v[d]
 		v[b] = bits.RotateLeft64(v[b]^v[cc], -24)
 		v[a] += y + v[b]
 		v[d] = bits.RotateLeft64(v[d]^v[a], -16)
 		v[cc] += v[d]
 		v[b] = bits.RotateLeft64(v[b]^v[cc], -63)
 	}
 	for pos := 0; pos < len(blocks); {
 		// Advance the byte counter, carrying into the high word on wrap-around.
 		lo += BlockSize
 		if lo < BlockSize {
 			hi++
 		}
 		// State = chaining value followed by the IV, tweaked by counter and flag.
 		copy(v[:8], h[:])
 		copy(v[8:], iv[:])
 		v[12] ^= lo
 		v[13] ^= hi
 		v[14] ^= flag
 		for w := range msg {
 			msg[w] = binary.LittleEndian.Uint64(blocks[pos:])
 			pos += 8
 		}
 		for round := range precomputed {
 			s := &precomputed[round]
 			// Column step: the four columns are independent of each other.
 			for col := 0; col < 4; col++ {
 				g(col, 4+col, 8+col, 12+col, msg[s[col]], msg[s[4+col]])
 			}
 			// Diagonal step.
 			for dg := 0; dg < 4; dg++ {
 				g(dg, 4+(dg+1)%4, 8+(dg+2)%4, 12+(dg+3)%4, msg[s[8+dg]], msg[s[12+dg]])
 			}
 		}
 		for k := range h {
 			h[k] ^= v[k] ^ v[k+8]
 		}
 	}
 	c[0], c[1] = lo, hi
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go
@ -0,0 +1,11 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64 appengine gccgo
 package blake2b
 // hashBlocks is the variant built when the build constraint above selects
 // this file; it always forwards to the portable hashBlocksGeneric.
 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	hashBlocksGeneric(h, c, flag, blocks)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2x.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2x.go
@ -0,0 +1,177 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package blake2b
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 )
 // XOF defines the interface to hash functions that
 // support arbitrary-length output. Use NewXOF to obtain one.
 type XOF interface {
 	// Write absorbs more data into the hash's state. It panics if called
 	// after Read.
 	io.Writer
 	// Read reads more output from the hash. It returns io.EOF if the limit
 	// has been reached.
 	io.Reader
 	// Clone returns a copy of the XOF in its current state.
 	Clone() XOF
 	// Reset resets the XOF to its initial state.
 	Reset()
 }
 // OutputLengthUnknown can be used as the size argument to NewXOF to indicate
 // the length of the output is not known in advance.
 const OutputLengthUnknown = 0
 // magicUnknownOutputLength is a magic value for the output size that indicates
 // an unknown number of output bytes. NewXOF rejects it as an explicit size
 // and substitutes it internally when given OutputLengthUnknown.
 const magicUnknownOutputLength = (1 << 32) - 1
 // maxOutputLength is the absolute maximum number of bytes to produce when the
 // number of output bytes is unknown: 2^32 * 64 bytes, i.e. 256GiB.
 const maxOutputLength = (1 << 32) * 64
 // NewXOF creates a new variable-output-length hash. The hash either produces
 // a known number of bytes (1 <= size < 2**32-1), or an unknown number of
 // bytes (size == OutputLengthUnknown). In the latter case, an absolute limit
 // of 256GiB applies.
 //
 // A non-nil key turns the hash into a MAC. The key must be at most Size
 // bytes long; a longer key is rejected with an error, as is the reserved
 // size 2**32-1.
 func NewXOF(size uint32, key []byte) (XOF, error) {
 	if len(key) > Size {
 		return nil, errKeySize
 	}
 	switch size {
 	case magicUnknownOutputLength:
 		// 2^32-1 is reserved to mean "unknown number of bytes" and is
 		// therefore not accepted as an explicit length.
 		return nil, errors.New("blake2b: XOF length too large")
 	case OutputLengthUnknown:
 		size = magicUnknownOutputLength
 	}
 	x := new(xof)
 	x.d.size = Size
 	x.d.keyLen = len(key)
 	x.length = size
 	copy(x.d.key[:], key)
 	x.Reset()
 	return x, nil
 }
 // xof is the XOF implementation returned by NewXOF.
 type xof struct {
 	// d absorbs the input and is re-initialized to produce each output block.
 	d                digest
 	// length is the requested output size, or magicUnknownOutputLength.
 	length           uint32
 	// remaining counts the output bytes that may still be read.
 	remaining        uint64
 	// cfg is the configuration passed to initConfig for every output block,
 	// root receives the finalized hash of the input on the first Read, and
 	// block buffers the most recently generated output block.
 	cfg, root, block [Size]byte
 	// offset is the number of bytes of block already handed out.
 	offset           int
 	// nodeOffset is the index written into cfg for the next output block.
 	nodeOffset       uint32
 	// readMode is set by the first Read; Write panics once it is set.
 	readMode         bool
 }
 // Write absorbs p into the underlying digest. It panics if Read has already
 // been called on x since the last Reset.
 func (x *xof) Write(p []byte) (n int, err error) {
 	if x.readMode {
 		panic("blake2b: write to XOF after read")
 	}
 	return x.d.Write(p)
 }
 // Clone returns a pointer to a by-value copy of x, so the copy and the
 // original can be written to or read from independently afterwards.
 func (x *xof) Clone() XOF {
 	clone := *x
 	return &clone
 }
 // Reset returns x to the state it had right after NewXOF: the output-block
 // configuration is rebuilt, the underlying digest is reset and tagged with
 // the XOF length, and all read-side bookkeeping is cleared.
 func (x *xof) Reset() {
 	// Configuration used when generating output blocks.
 	x.cfg[0], x.cfg[17] = byte(Size), byte(Size)            // digest size, inner hash size
 	binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
 	binary.LittleEndian.PutUint32(x.cfg[12:], x.length)    // XOF length
 	x.d.Reset()
 	x.d.h[1] ^= uint64(x.length) << 32
 	if x.length == magicUnknownOutputLength {
 		// Unknown length: cap the output at the absolute maximum.
 		x.remaining = maxOutputLength
 	} else {
 		x.remaining = uint64(x.length)
 	}
 	x.offset = 0
 	x.nodeOffset = 0
 	x.readMode = false
 }
 // Read fills p with the next bytes of output and returns how many were
 // written. Once the output limit has been reached it returns 0, io.EOF; a
 // request that crosses the limit is truncated to the bytes that remain.
 func (x *xof) Read(p []byte) (n int, err error) {
 	// The first Read finalizes the absorbed input into x.root and switches
 	// the XOF to read mode, after which Write panics.
 	if !x.readMode {
 		x.d.finalize(&x.root)
 		x.readMode = true
 	}
 	if x.remaining == 0 {
 		return 0, io.EOF
 	}
 	// Never hand out more than the remaining output budget.
 	n = len(p)
 	if uint64(n) > x.remaining {
 		n = int(x.remaining)
 		p = p[:n]
 	}
 	// Serve leftover bytes of the previously generated block first.
 	if x.offset > 0 {
 		blockRemaining := Size - x.offset
 		if n < blockRemaining {
 			x.offset += copy(p, x.block[x.offset:])
 			x.remaining -= uint64(n)
 			return
 		}
 		copy(p, x.block[x.offset:])
 		p = p[blockRemaining:]
 		x.offset = 0
 		x.remaining -= uint64(blockRemaining)
 	}
 	// Generate whole output blocks: each one hashes x.root under a
 	// configuration that carries the current node offset.
 	for len(p) >= Size {
 		binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
 		x.nodeOffset++
 		x.d.initConfig(&x.cfg)
 		x.d.Write(x.root[:])
 		x.d.finalize(&x.block)
 		copy(p, x.block[:])
 		p = p[Size:]
 		x.remaining -= uint64(Size)
 	}
 	// Trailing partial block: generate it, copy out what fits in p and
 	// remember via x.offset how much of x.block has been consumed.
 	if todo := len(p); todo > 0 {
 		// If fewer than Size bytes of output remain overall, the digest
 		// length byte of the configuration is shortened to match.
 		if x.remaining < uint64(Size) {
 			x.cfg[0] = byte(x.remaining)
 		}
 		binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
 		x.nodeOffset++
 		x.d.initConfig(&x.cfg)
 		x.d.Write(x.root[:])
 		x.d.finalize(&x.block)
 		x.offset = copy(p, x.block[:todo])
 		x.remaining -= uint64(todo)
 	}
 	return
 }
 // initConfig restarts d for a fresh hash under the given configuration: the
 // buffer offset and both counter words are zeroed, and every chaining word
 // becomes the matching IV word XORed with the little-endian 64-bit word read
 // from cfg at the same word index.
 func (d *digest) initConfig(cfg *[Size]byte) {
 	d.offset = 0
 	d.c[0] = 0
 	d.c[1] = 0
 	for i := 0; i < len(d.h); i++ {
 		word := binary.LittleEndian.Uint64(cfg[8*i:])
 		d.h[i] = iv[i] ^ word
 	}
 }
--- a/vendor/golang.org/x/crypto/blake2b/register.go
+++ b/vendor/golang.org/x/crypto/blake2b/register.go
@ -0,0 +1,32 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.9
 package blake2b
 import (
 	"crypto"
 	"hash"
 )
 // init registers constructors for the unkeyed 256-, 384- and 512-bit
 // variants with the crypto package under crypto.BLAKE2b_256,
 // crypto.BLAKE2b_384 and crypto.BLAKE2b_512.
 //
 // The error result of each constructor is discarded.
 // NOTE(review): presumably a nil key can never fail; confirm against the
 // constructors, which are defined outside this file.
 func init() {
 	crypto.RegisterHash(crypto.BLAKE2b_256, func() hash.Hash {
 		h256, _ := New256(nil)
 		return h256
 	})
 	crypto.RegisterHash(crypto.BLAKE2b_384, func() hash.Hash {
 		h384, _ := New384(nil)
 		return h384
 	})
 	crypto.RegisterHash(crypto.BLAKE2b_512, func() hash.Hash {
 		h512, _ := New512(nil)
 		return h512
 	})
 }
--- a/vendor/golang.org/x/crypto/scrypt/scrypt.go
+++ b/vendor/golang.org/x/crypto/scrypt/scrypt.go
@ -0,0 +1,213 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package scrypt implements the scrypt key derivation function as defined in
 // Colin Percival's paper "Stronger Key Derivation via Sequential Memory-Hard
 // Functions" (https://www.tarsnap.com/scrypt/scrypt.pdf).
 package scrypt // import "golang.org/x/crypto/scrypt"
 import (
 	"crypto/sha256"
 	"errors"
 	"math/bits"
 	"golang.org/x/crypto/pbkdf2"
 )
 // maxInt is the largest value an int can hold on the target platform; Key
 // uses it to reject parameters whose buffer sizes would overflow int.
 const maxInt = int(^uint(0) >> 1)
 // blockCopy copies the first n words of src into dst. As with the built-in
 // copy, no more than len(dst) words are written.
 func blockCopy(dst, src []uint32, n int) {
 	head := src[:n]
 	copy(dst, head)
 }
 // blockXOR XORs the first n words of src into the matching words of dst.
 func blockXOR(dst, src []uint32, n int) {
 	src = src[:n]
 	for i := range src {
 		dst[i] ^= src[i]
 	}
 }
 // salsaXOR applies Salsa20/8 to the XOR of 16 numbers from tmp and in,
 // and puts the result into both tmp and out.
 func salsaXOR(tmp *[16]uint32, in, out []uint32) {
 	// seed is the XORed input; it is added back onto the permuted state at
 	// the end.
 	var seed [16]uint32
 	for k := range seed {
 		seed[k] = tmp[k] ^ in[k]
 	}
 	st := seed
 	// quarter is one quarter-round over the state words at the given indices.
 	quarter := func(a, b, c, d int) {
 		st[b] ^= bits.RotateLeft32(st[a]+st[d], 7)
 		st[c] ^= bits.RotateLeft32(st[b]+st[a], 9)
 		st[d] ^= bits.RotateLeft32(st[c]+st[b], 13)
 		st[a] ^= bits.RotateLeft32(st[d]+st[c], 18)
 	}
 	// Eight rounds, taken two at a time: one column round, then one row round.
 	for round := 0; round < 8; round += 2 {
 		quarter(0, 4, 8, 12)
 		quarter(5, 9, 13, 1)
 		quarter(10, 14, 2, 6)
 		quarter(15, 3, 7, 11)
 		quarter(0, 1, 2, 3)
 		quarter(5, 6, 7, 4)
 		quarter(10, 11, 8, 9)
 		quarter(15, 12, 13, 14)
 	}
 	for k := range st {
 		word := st[k] + seed[k]
 		out[k], tmp[k] = word, word
 	}
 }
 // blockMix runs salsaXOR over the 2*r 16-word sub-blocks of in, chaining
 // through tmp, which is seeded with the last sub-block of in. Outputs for
 // even-numbered sub-blocks fill the first half of out and outputs for
 // odd-numbered sub-blocks fill the second half.
 func blockMix(tmp *[16]uint32, in, out []uint32, r int) {
 	blockCopy(tmp[:], in[(2*r-1)*16:], 16)
 	for pair := 0; pair < r; pair++ {
 		even, odd := 32*pair, 32*pair+16
 		salsaXOR(tmp, in[even:], out[16*pair:])
 		salsaXOR(tmp, in[odd:], out[16*pair+r*16:])
 	}
 }
 // integer returns the first two words of the last 16-word sub-block of b
 // (which holds 2*r sub-blocks) combined into a uint64, low word first.
 func integer(b []uint32, r int) uint64 {
 	idx := (2*r - 1) * 16
 	lo := uint64(b[idx])
 	hi := uint64(b[idx+1])
 	return hi<<32 | lo
 }
 // smix transforms the 128*r-byte block at the start of b in place. v must
 // have room for N blocks of 32*r words and xy for two such blocks; both are
 // scratch space whose previous contents are overwritten.
 func smix(b []byte, r, N int, v, xy []uint32) {
 	var scratch [16]uint32
 	words := 32 * r
 	x, y := xy, xy[words:]
 	// Decode b into little-endian 32-bit words.
 	for i := 0; i < words; i++ {
 		off := 4 * i
 		x[i] = uint32(b[off]) | uint32(b[off+1])<<8 | uint32(b[off+2])<<16 | uint32(b[off+3])<<24
 	}
 	// Fill v with successive blockMix outputs, two entries per iteration so
 	// that x and y can alternate as input and output.
 	for i := 0; i < N; i += 2 {
 		blockCopy(v[i*words:], x, words)
 		blockMix(&scratch, x, y, r)
 		blockCopy(v[(i+1)*words:], y, words)
 		blockMix(&scratch, y, x, r)
 	}
 	// Mix in entries of v chosen by the current block contents.
 	mask := uint64(N - 1)
 	for i := 0; i < N; i += 2 {
 		pick := int(integer(x, r) & mask)
 		blockXOR(x, v[pick*words:], words)
 		blockMix(&scratch, x, y, r)
 		pick = int(integer(y, r) & mask)
 		blockXOR(y, v[pick*words:], words)
 		blockMix(&scratch, y, x, r)
 	}
 	// Encode the result back into b as little-endian bytes.
 	for i, w := range x[:words] {
 		off := 4 * i
 		b[off+0] = byte(w >> 0)
 		b[off+1] = byte(w >> 8)
 		b[off+2] = byte(w >> 16)
 		b[off+3] = byte(w >> 24)
 	}
 }
 // Key derives a key from the password, salt, and cost parameters, returning
 // a byte slice of length keyLen that can be used as cryptographic key.
 //
 // N is a CPU/memory cost parameter, which must be a power of two greater than 1.
 // r and p must be positive and satisfy r * p < 2³⁰. If the parameters do not
 // satisfy the limits, the function returns a nil byte slice and an error.
 //
 // For example, you can get a derived key for e.g. AES-256 (which needs a
 // 32-byte key) by doing:
 //
 //      dk, err := scrypt.Key([]byte("some password"), salt, 32768, 8, 1, 32)
 //
 // The recommended parameters for interactive logins as of 2017 are N=32768, r=8
 // and p=1. The parameters N, r, and p should be increased as memory latency and
 // CPU parallelism increases; consider setting N to the highest power of 2 you
 // can derive within 100 milliseconds. Remember to get a good random salt.
 func Key(password, salt []byte, N, r, p, keyLen int) ([]byte, error) {
 	if N <= 1 || N&(N-1) != 0 {
 		return nil, errors.New("scrypt: N must be > 1 and a power of 2")
 	}
 	// Reject non-positive r and p before the overflow checks below: those
 	// checks divide by p and r, so a zero value would panic with a division
 	// by zero instead of returning the documented error.
 	if r <= 0 || p <= 0 {
 		return nil, errors.New("scrypt: r and p must be > 0")
 	}
 	if uint64(r)*uint64(p) >= 1<<30 || r > maxInt/128/p || r > maxInt/256 || N > maxInt/128/r {
 		return nil, errors.New("scrypt: parameters are too large")
 	}
 	// xy holds two 32*r-word working blocks for smix; v holds N such blocks.
 	xy := make([]uint32, 64*r)
 	v := make([]uint32, 32*N*r)
 	// Expand the password into p independent 128*r-byte blocks and mix each
 	// of them in place.
 	b := pbkdf2.Key(password, salt, 1, p*128*r, sha256.New)
 	for i := 0; i < p; i++ {
 		smix(b[i*128*r:], r, N, v, xy)
 	}
 	// The mixed blocks serve as the salt for the final key derivation.
 	return pbkdf2.Key(password, b, 1, keyLen, sha256.New), nil
 }
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -336,12 +336,15 @@ github.com/xanzy/ssh-agent
 github.com/yohcop/openid-go
 # golang.org/x/crypto v0.0.0-20190618222545-ea8f1a30c443
 golang.org/x/crypto/acme/autocert
 golang.org/x/crypto/argon2
 golang.org/x/crypto/bcrypt
 golang.org/x/crypto/pbkdf2
 golang.org/x/crypto/scrypt
 golang.org/x/crypto/ssh
 golang.org/x/crypto/acme
 golang.org/x/crypto/openpgp
 golang.org/x/crypto/md4
 golang.org/x/crypto/blake2b
 golang.org/x/crypto/blowfish
 golang.org/x/crypto/curve25519
 golang.org/x/crypto/ed25519
@ -372,8 +375,8 @@ golang.org/x/oauth2/internal
 # golang.org/x/sys v0.0.0-20190620070143-6f217b454f45
 golang.org/x/sys/windows
 golang.org/x/sys/windows/svc
 golang.org/x/sys/unix
 golang.org/x/sys/cpu
 golang.org/x/sys/unix
 # golang.org/x/text v0.3.2
 golang.org/x/text/transform
 golang.org/x/text/encoding