diff --git a/crypto/chacha.go b/crypto/chacha.go index 88f8661..cde97c6 100644 --- a/crypto/chacha.go +++ b/crypto/chacha.go @@ -5,7 +5,7 @@ import ( "crypto/sha512" "encoding/base64" - "github.com/aead/chacha20" + "golang.org/x/crypto/chacha20" ) // ChaChaStream is a ChaCha20 cipher that implements Stream and Encrypter interface @@ -15,7 +15,7 @@ type ChaChaStream struct { // NewChaChaEncrypter initialize a new ChaChaStream interfaced with Encrypter func NewChaChaEncrypter(key []byte, iv []byte) (*ChaChaStream, error) { - cipher, err := chacha20.NewCipher(iv, key) + cipher, err := chacha20.NewUnauthenticatedCipher(key, iv) if err != nil { return nil, err } @@ -30,7 +30,7 @@ func NewChaChaEncrypter(key []byte, iv []byte) (*ChaChaStream, error) { func NewChaChaStream(key []byte) (*ChaChaStream, error) { hash := sha512.Sum512(key) - cipher, err := chacha20.NewCipher(hash[32:44], hash[:32]) + cipher, err := chacha20.NewUnauthenticatedCipher(hash[:32], hash[32:44]) if err != nil { return nil, err } diff --git a/crypto_test.go b/crypto_test.go index 61e121a..9083b5e 100644 --- a/crypto_test.go +++ b/crypto_test.go @@ -26,7 +26,10 @@ func TestChaCha(t *testing.T) { payload := []byte("test message") // Encrypt - c, _ := NewStreamManager(ChaChaStreamID, key) + c, err := NewStreamManager(ChaChaStreamID, key) + if err != nil { + t.Fatalf("Failed to create stream manager: %v", err) + } crypted := c.Pack(payload) // Decrypt diff --git a/go.mod b/go.mod index 3d65546..2d3aaff 100644 --- a/go.mod +++ b/go.mod @@ -4,16 +4,15 @@ go 1.20 require ( github.com/aead/argon2 v0.0.0-20180111183520-a87724528b07 - github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da github.com/google/go-cmp v0.6.0 github.com/stretchr/testify v1.8.4 + golang.org/x/crypto v0.17.0 golang.org/x/exp v0.0.0-20230105202349-8879d0199aa3 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - golang.org/x/crypto v0.17.0 // indirect golang.org/x/sys v0.15.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index cf2c77d..14c2078 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ github.com/aead/argon2 v0.0.0-20180111183520-a87724528b07 h1:i9/M2RadeVsPBMNwXFiaYkXQi9lY9VuZeI4Onavd3pA= github.com/aead/argon2 v0.0.0-20180111183520-a87724528b07/go.mod h1:Tnm/osX+XXr9R+S71o5/F0E60sRkPVALdhWw25qPImQ= -github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da h1:KjTM2ks9d14ZYCvmHS9iAKVt9AyzRSqNU1qabPih5BY= -github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da/go.mod h1:eHEWzANqSiWQsof+nXEI9bUVUyV6F53Fp89EuCh2EAA= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= @@ -10,14 +8,10 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -golang.org/x/crypto v0.8.0 h1:pd9TJtTueMTVQXzk8E2XESSMQDj/U7OUu0PqJqPXQjQ= -golang.org/x/crypto v0.8.0/go.mod h1:mRqEX+O9/h5TFCrQhkgjo2yKi0yYA+9ecGkdQoHrywE= golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/exp 
v0.0.0-20230105202349-8879d0199aa3 h1:fJwx88sMf5RXwDwziL0/Mn9Wqs+efMSo/RYcL+37W9c= golang.org/x/exp v0.0.0-20230105202349-8879d0199aa3/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= -golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= -golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= diff --git a/vendor/github.com/aead/chacha20/.gitignore b/vendor/github.com/aead/chacha20/.gitignore deleted file mode 100644 index 9d3d843..0000000 --- a/vendor/github.com/aead/chacha20/.gitignore +++ /dev/null @@ -1,25 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test -.vscode - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof diff --git a/vendor/github.com/aead/chacha20/.travis.yml b/vendor/github.com/aead/chacha20/.travis.yml deleted file mode 100644 index 99199e0..0000000 --- a/vendor/github.com/aead/chacha20/.travis.yml +++ /dev/null @@ -1,25 +0,0 @@ -language: go - -go: - - "1.8.x" - - "1.9.x" - - "1.10.x" - -env: - - TRAVIS_GOARCH=amd64 - - TRAVIS_GOARCH=386 - -before_install: -- export GOARCH=$TRAVIS_GOARCH - -branches: - only: - - master - -before_script: -- go get -u github.com/klauspost/asmfmt/cmd/asmfmt - -script: -- diff -au <(gofmt -d .) <(printf "") -- diff -au <(asmfmt -d .) <(printf "") -- go test -v ./... diff --git a/vendor/github.com/aead/chacha20/LICENSE b/vendor/github.com/aead/chacha20/LICENSE deleted file mode 100644 index b6a9210..0000000 --- a/vendor/github.com/aead/chacha20/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2016 Andreas Auernhammer - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
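Editor's note (not part of this diff): the two libraries swap constructor argument order — aead/chacha20's NewCipher takes (nonce, key), while x/crypto's NewUnauthenticatedCipher takes (key, nonce) and expects a 32-byte key with a 12-byte nonce, which is why the hunks above reorder the arguments. A minimal, self-contained sketch of the new API, mirroring the SHA-512 key/nonce derivation used by NewChaChaStream in crypto/chacha.go (the input key material below is a hypothetical example):

```go
package main

import (
	"crypto/sha512"
	"fmt"

	"golang.org/x/crypto/chacha20"
)

func main() {
	// Derive a 32-byte key and 12-byte nonce from arbitrary key material,
	// as NewChaChaStream does above.
	hash := sha512.Sum512([]byte("example key material")) // hypothetical input

	// Key first, then nonce — the reverse of aead/chacha20's NewCipher(iv, key).
	c, err := chacha20.NewUnauthenticatedCipher(hash[:32], hash[32:44])
	if err != nil {
		panic(err)
	}

	plaintext := []byte("test message")
	ciphertext := make([]byte, len(plaintext))
	// XORKeyStream encrypts; running the same keystream over the ciphertext
	// with a freshly constructed cipher decrypts it.
	c.XORKeyStream(ciphertext, plaintext)
	fmt.Printf("%x\n", ciphertext)
}
```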
diff --git a/vendor/github.com/aead/chacha20/README.md b/vendor/github.com/aead/chacha20/README.md deleted file mode 100644 index b369424..0000000 --- a/vendor/github.com/aead/chacha20/README.md +++ /dev/null @@ -1,82 +0,0 @@ -[![Godoc Reference](https://godoc.org/github.com/aead/chacha20?status.svg)](https://godoc.org/github.com/aead/chacha20) -[![Build Status](https://travis-ci.org/aead/chacha20.svg?branch=master)](https://travis-ci.org/aead/chacha20) -[![Go Report Card](https://goreportcard.com/badge/aead/chacha20)](https://goreportcard.com/report/aead/chacha20) - -## The ChaCha20 stream cipher - -ChaCha is a stream cipher family created by Daniel J. Bernstein. -The most common ChaCha variant is ChaCha20 (20 rounds). ChaCha20 is -standardized in [RFC 7539](https://tools.ietf.org/html/rfc7539 "RFC 7539"). - -This package provides implementations of three ChaCha versions: -- ChaCha20 with a 64 bit nonce (can en/decrypt up to 2^64 * 64 bytes for one key-nonce combination) -- ChaCha20 with a 96 bit nonce (can en/decrypt up to 2^32 * 64 bytes ~ 256 GB for one key-nonce combination) -- XChaCha20 with a 192 bit nonce (can en/decrypt up to 2^64 * 64 bytes for one key-nonce combination) - -Furthermore the chacha sub package implements ChaCha20/12 and ChaCha20/8. -These versions use 12 or 8 rounds instead of 20. -But it's recommended to use ChaCha20 (with 20 rounds) - it will be fast enough for almost all purposes. - -### Installation -Install in your GOPATH: `go get -u github.com/aead/chacha20` - -### Requirements -All go versions >= 1.8.7 are supported. -The code may also work on Go 1.7 but this is not tested. - -### Performance - -#### AMD64 -Hardware: Intel i7-6500U 2.50GHz x 2 -System: Linux Ubuntu 16.04 - kernel: 4.4.0-62-generic -Go version: 1.8.0 -``` -AVX2 -name speed cpb -ChaCha20_64-4 573MB/s ± 0% 4.16 -ChaCha20_1K-4 2.19GB/s ± 0% 1.06 -XChaCha20_64-4 261MB/s ± 0% 9.13 -XChaCha20_1K-4 1.69GB/s ± 4% 1.37 -XORKeyStream64-4 474MB/s ± 2% 5.02 -XORKeyStream1K-4 2.09GB/s ± 1% 1.11 -XChaCha20_XORKeyStream64-4 262MB/s ± 0% 9.09 -XChaCha20_XORKeyStream1K-4 1.71GB/s ± 1% 1.36 - -SSSE3 -name speed cpb -ChaCha20_64-4 583MB/s ± 0% 4.08 -ChaCha20_1K-4 1.15GB/s ± 1% 2.02 -XChaCha20_64-4 267MB/s ± 0% 8.92 -XChaCha20_1K-4 984MB/s ± 5% 2.42 -XORKeyStream64-4 492MB/s ± 1% 4.84 -XORKeyStream1K-4 1.10GB/s ± 5% 2.11 -XChaCha20_XORKeyStream64-4 266MB/s ± 0% 8.96 -XChaCha20_XORKeyStream1K-4 1.00GB/s ± 2% 2.32 -``` -#### 386 -Hardware: Intel i7-6500U 2.50GHz x 2 -System: Linux Ubuntu 16.04 - kernel: 4.4.0-62-generic -Go version: 1.8.0 -``` -SSSE3 -name                        speed cpb -ChaCha20_64-4               570MB/s ± 0% 4.18 -ChaCha20_1K-4               650MB/s ± 0% 3.66 -XChaCha20_64-4              223MB/s ± 0% 10.69 -XChaCha20_1K-4              584MB/s ± 1% 4.08 -XORKeyStream64-4            392MB/s ± 1% 6.08 -XORKeyStream1K-4            629MB/s ± 1% 3.79 -XChaCha20_XORKeyStream64-4  222MB/s ± 0% 10.73 -XChaCha20_XORKeyStream1K-4  585MB/s ± 0% 4.07 - -SSE2 -name speed cpb -ChaCha20_64-4 509MB/s ± 0% 4.68 -ChaCha20_1K-4 553MB/s ± 2% 4.31 -XChaCha20_64-4 201MB/s ± 0% 11.86 -XChaCha20_1K-4 498MB/s ± 4% 4.78 -XORKeyStream64-4 359MB/s ± 1% 6.64 -XORKeyStream1K-4 545MB/s ± 0% 4.37 -XChaCha20_XORKeyStream64-4 201MB/s ± 1% 11.86 -XChaCha20_XORKeyStream1K-4 507MB/s ± 0% 4.70 -``` diff --git a/vendor/github.com/aead/chacha20/chacha/chacha.go b/vendor/github.com/aead/chacha20/chacha/chacha.go deleted file mode 100644 index c2b39da..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha.go +++ /dev/null @@ 
-1,197 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// Package chacha implements some low-level functions of the -// ChaCha cipher family. -package chacha // import "github.com/aead/chacha20/chacha" - -import ( - "encoding/binary" - "errors" - "math" -) - -const ( - // NonceSize is the size of the ChaCha20 nonce in bytes. - NonceSize = 8 - - // INonceSize is the size of the IETF-ChaCha20 nonce in bytes. - INonceSize = 12 - - // XNonceSize is the size of the XChaCha20 nonce in bytes. - XNonceSize = 24 - - // KeySize is the size of the key in bytes. - KeySize = 32 -) - -var ( - useSSE2 bool - useSSSE3 bool - useAVX bool - useAVX2 bool -) - -var ( - errKeySize = errors.New("chacha20/chacha: bad key length") - errInvalidNonce = errors.New("chacha20/chacha: bad nonce length") -) - -func setup(state *[64]byte, nonce, key []byte) (err error) { - if len(key) != KeySize { - err = errKeySize - return - } - var Nonce [16]byte - switch len(nonce) { - case NonceSize: - copy(Nonce[8:], nonce) - initialize(state, key, &Nonce) - case INonceSize: - copy(Nonce[4:], nonce) - initialize(state, key, &Nonce) - case XNonceSize: - var tmpKey [32]byte - var hNonce [16]byte - - copy(hNonce[:], nonce[:16]) - copy(tmpKey[:], key) - HChaCha20(&tmpKey, &hNonce, &tmpKey) - copy(Nonce[8:], nonce[16:]) - initialize(state, tmpKey[:], &Nonce) - - // BUG(aead): A "good" compiler will remove this (optimizations) - // But using the provided key instead of tmpKey, - // will change the key (-> probably confuses users) - for i := range tmpKey { - tmpKey[i] = 0 - } - default: - err = errInvalidNonce - } - return -} - -// XORKeyStream crypts bytes from src to dst using the given nonce and key. -// The length of the nonce determinds the version of ChaCha20: -// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period. -// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period. -// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period. -// The rounds argument specifies the number of rounds performed for keystream -// generation - valid values are 8, 12 or 20. The src and dst may be the same slice -// but otherwise should not overlap. If len(dst) < len(src) this function panics. -// If the nonce is neither 64, 96 nor 192 bits long, this function panics. -func XORKeyStream(dst, src, nonce, key []byte, rounds int) { - if rounds != 20 && rounds != 12 && rounds != 8 { - panic("chacha20/chacha: bad number of rounds") - } - if len(dst) < len(src) { - panic("chacha20/chacha: dst buffer is to small") - } - if len(nonce) == INonceSize && uint64(len(src)) > (1<<38) { - panic("chacha20/chacha: src is too large") - } - - var block, state [64]byte - if err := setup(&state, nonce, key); err != nil { - panic(err) - } - xorKeyStream(dst, src, &block, &state, rounds) -} - -// Cipher implements ChaCha20/r (XChaCha20/r) for a given number of rounds r. -type Cipher struct { - state, block [64]byte - off int - rounds int // 20 for ChaCha20 - noncesize int -} - -// NewCipher returns a new *chacha.Cipher implementing the ChaCha20/r or XChaCha20/r -// (r = 8, 12 or 20) stream cipher. The nonce must be unique for one key for all time. -// The length of the nonce determinds the version of ChaCha20: -// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period. -// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period. 
-// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period. -// If the nonce is neither 64, 96 nor 192 bits long, a non-nil error is returned. -func NewCipher(nonce, key []byte, rounds int) (*Cipher, error) { - if rounds != 20 && rounds != 12 && rounds != 8 { - panic("chacha20/chacha: bad number of rounds") - } - - c := new(Cipher) - if err := setup(&(c.state), nonce, key); err != nil { - return nil, err - } - c.rounds = rounds - - if len(nonce) == INonceSize { - c.noncesize = INonceSize - } else { - c.noncesize = NonceSize - } - - return c, nil -} - -// XORKeyStream crypts bytes from src to dst. Src and dst may be the same slice -// but otherwise should not overlap. If len(dst) < len(src) the function panics. -func (c *Cipher) XORKeyStream(dst, src []byte) { - if len(dst) < len(src) { - panic("chacha20/chacha: dst buffer is to small") - } - - if c.off > 0 { - n := len(c.block[c.off:]) - if len(src) <= n { - for i, v := range src { - dst[i] = v ^ c.block[c.off] - c.off++ - } - if c.off == 64 { - c.off = 0 - } - return - } - - for i, v := range c.block[c.off:] { - dst[i] = src[i] ^ v - } - src = src[n:] - dst = dst[n:] - c.off = 0 - } - - // check for counter overflow - blocksToXOR := len(src) / 64 - if len(src)%64 != 0 { - blocksToXOR++ - } - var overflow bool - if c.noncesize == INonceSize { - overflow = binary.LittleEndian.Uint32(c.state[48:]) > math.MaxUint32-uint32(blocksToXOR) - } else { - overflow = binary.LittleEndian.Uint64(c.state[48:]) > math.MaxUint64-uint64(blocksToXOR) - } - if overflow { - panic("chacha20/chacha: counter overflow") - } - - c.off += xorKeyStream(dst, src, &(c.block), &(c.state), c.rounds) -} - -// SetCounter skips ctr * 64 byte blocks. SetCounter(0) resets the cipher. -// This function always skips the unused keystream of the current 64 byte block. -func (c *Cipher) SetCounter(ctr uint64) { - if c.noncesize == INonceSize { - binary.LittleEndian.PutUint32(c.state[48:], uint32(ctr)) - } else { - binary.LittleEndian.PutUint64(c.state[48:], ctr) - } - c.off = 0 -} - -// HChaCha20 generates 32 pseudo-random bytes from a 128 bit nonce and a 256 bit secret key. -// It can be used as a key-derivation-function (KDF). -func HChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { hChaCha20(out, nonce, key) } diff --git a/vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s b/vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s deleted file mode 100644 index c2b5f52..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. 
- -// +build amd64,!gccgo,!appengine,!nacl - -#include "const.s" -#include "macro.s" - -#define TWO 0(SP) -#define C16 32(SP) -#define C8 64(SP) -#define STATE_0 96(SP) -#define STATE_1 128(SP) -#define STATE_2 160(SP) -#define STATE_3 192(SP) -#define TMP_0 224(SP) -#define TMP_1 256(SP) - -// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int -TEXT ·xorKeyStreamAVX2(SB), 4, $320-80 - MOVQ dst_base+0(FP), DI - MOVQ src_base+24(FP), SI - MOVQ block+48(FP), BX - MOVQ state+56(FP), AX - MOVQ rounds+64(FP), DX - MOVQ src_len+32(FP), CX - - MOVQ SP, R8 - ADDQ $32, SP - ANDQ $-32, SP - - VMOVDQU 0(AX), Y2 - VMOVDQU 32(AX), Y3 - VPERM2I128 $0x22, Y2, Y0, Y0 - VPERM2I128 $0x33, Y2, Y1, Y1 - VPERM2I128 $0x22, Y3, Y2, Y2 - VPERM2I128 $0x33, Y3, Y3, Y3 - - TESTQ CX, CX - JZ done - - VMOVDQU ·one_AVX2<>(SB), Y4 - VPADDD Y4, Y3, Y3 - - VMOVDQA Y0, STATE_0 - VMOVDQA Y1, STATE_1 - VMOVDQA Y2, STATE_2 - VMOVDQA Y3, STATE_3 - - VMOVDQU ·rol16_AVX2<>(SB), Y4 - VMOVDQU ·rol8_AVX2<>(SB), Y5 - VMOVDQU ·two_AVX2<>(SB), Y6 - VMOVDQA Y4, Y14 - VMOVDQA Y5, Y15 - VMOVDQA Y4, C16 - VMOVDQA Y5, C8 - VMOVDQA Y6, TWO - - CMPQ CX, $64 - JBE between_0_and_64 - CMPQ CX, $192 - JBE between_64_and_192 - CMPQ CX, $320 - JBE between_192_and_320 - CMPQ CX, $448 - JBE between_320_and_448 - -at_least_512: - VMOVDQA Y0, Y4 - VMOVDQA Y1, Y5 - VMOVDQA Y2, Y6 - VPADDQ TWO, Y3, Y7 - VMOVDQA Y0, Y8 - VMOVDQA Y1, Y9 - VMOVDQA Y2, Y10 - VPADDQ TWO, Y7, Y11 - VMOVDQA Y0, Y12 - VMOVDQA Y1, Y13 - VMOVDQA Y2, Y14 - VPADDQ TWO, Y11, Y15 - - MOVQ DX, R9 - -chacha_loop_512: - VMOVDQA Y8, TMP_0 - CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8) - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8) - VMOVDQA TMP_0, Y8 - VMOVDQA Y0, TMP_0 - CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8) - CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8) - CHACHA_SHUFFLE_AVX(Y1, Y2, Y3) - CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) - CHACHA_SHUFFLE_AVX(Y9, Y10, Y11) - CHACHA_SHUFFLE_AVX(Y13, Y14, Y15) - - CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8) - CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8) - VMOVDQA TMP_0, Y0 - VMOVDQA Y8, TMP_0 - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8) - CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8) - VMOVDQA TMP_0, Y8 - CHACHA_SHUFFLE_AVX(Y3, Y2, Y1) - CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) - CHACHA_SHUFFLE_AVX(Y11, Y10, Y9) - CHACHA_SHUFFLE_AVX(Y15, Y14, Y13) - SUBQ $2, R9 - JA chacha_loop_512 - - VMOVDQA Y12, TMP_0 - VMOVDQA Y13, TMP_1 - VPADDD STATE_0, Y0, Y0 - VPADDD STATE_1, Y1, Y1 - VPADDD STATE_2, Y2, Y2 - VPADDD STATE_3, Y3, Y3 - XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13) - VMOVDQA STATE_0, Y0 - VMOVDQA STATE_1, Y1 - VMOVDQA STATE_2, Y2 - VMOVDQA STATE_3, Y3 - VPADDQ TWO, Y3, Y3 - - VPADDD Y0, Y4, Y4 - VPADDD Y1, Y5, Y5 - VPADDD Y2, Y6, Y6 - VPADDD Y3, Y7, Y7 - XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13) - VPADDQ TWO, Y3, Y3 - - VPADDD Y0, Y8, Y8 - VPADDD Y1, Y9, Y9 - VPADDD Y2, Y10, Y10 - VPADDD Y3, Y11, Y11 - XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13) - VPADDQ TWO, Y3, Y3 - - VPADDD TMP_0, Y0, Y12 - VPADDD TMP_1, Y1, Y13 - VPADDD Y2, Y14, Y14 - VPADDD Y3, Y15, Y15 - VPADDQ TWO, Y3, Y3 - - CMPQ CX, $512 - JB less_than_512 - - XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5) - VMOVDQA Y3, STATE_3 - ADDQ $512, SI - ADDQ $512, DI - SUBQ $512, CX - CMPQ CX, $448 - JA at_least_512 - - TESTQ CX, CX - JZ done - - VMOVDQA C16, Y14 - VMOVDQA C8, Y15 - - CMPQ CX, $64 - JBE between_0_and_64 - CMPQ CX, $192 - JBE between_64_and_192 - CMPQ CX, $320 - JBE between_192_and_320 - JMP between_320_and_448 - 
-less_than_512: - XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5) - EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4) - ADDQ $448, SI - ADDQ $448, DI - SUBQ $448, CX - JMP finalize - -between_320_and_448: - VMOVDQA Y0, Y4 - VMOVDQA Y1, Y5 - VMOVDQA Y2, Y6 - VPADDQ TWO, Y3, Y7 - VMOVDQA Y0, Y8 - VMOVDQA Y1, Y9 - VMOVDQA Y2, Y10 - VPADDQ TWO, Y7, Y11 - - MOVQ DX, R9 - -chacha_loop_384: - CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15) - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) - CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) - CHACHA_SHUFFLE_AVX(Y1, Y2, Y3) - CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) - CHACHA_SHUFFLE_AVX(Y9, Y10, Y11) - CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15) - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) - CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) - CHACHA_SHUFFLE_AVX(Y3, Y2, Y1) - CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) - CHACHA_SHUFFLE_AVX(Y11, Y10, Y9) - SUBQ $2, R9 - JA chacha_loop_384 - - VPADDD STATE_0, Y0, Y0 - VPADDD STATE_1, Y1, Y1 - VPADDD STATE_2, Y2, Y2 - VPADDD STATE_3, Y3, Y3 - XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13) - VMOVDQA STATE_0, Y0 - VMOVDQA STATE_1, Y1 - VMOVDQA STATE_2, Y2 - VMOVDQA STATE_3, Y3 - VPADDQ TWO, Y3, Y3 - - VPADDD Y0, Y4, Y4 - VPADDD Y1, Y5, Y5 - VPADDD Y2, Y6, Y6 - VPADDD Y3, Y7, Y7 - XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13) - VPADDQ TWO, Y3, Y3 - - VPADDD Y0, Y8, Y8 - VPADDD Y1, Y9, Y9 - VPADDD Y2, Y10, Y10 - VPADDD Y3, Y11, Y11 - VPADDQ TWO, Y3, Y3 - - CMPQ CX, $384 - JB less_than_384 - - XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13) - SUBQ $384, CX - TESTQ CX, CX - JE done - - ADDQ $384, SI - ADDQ $384, DI - JMP between_0_and_64 - -less_than_384: - XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13) - EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12) - ADDQ $320, SI - ADDQ $320, DI - SUBQ $320, CX - JMP finalize - -between_192_and_320: - VMOVDQA Y0, Y4 - VMOVDQA Y1, Y5 - VMOVDQA Y2, Y6 - VMOVDQA Y3, Y7 - VMOVDQA Y0, Y8 - VMOVDQA Y1, Y9 - VMOVDQA Y2, Y10 - VPADDQ TWO, Y3, Y11 - - MOVQ DX, R9 - -chacha_loop_256: - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) - CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) - CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) - CHACHA_SHUFFLE_AVX(Y9, Y10, Y11) - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) - CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) - CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) - CHACHA_SHUFFLE_AVX(Y11, Y10, Y9) - SUBQ $2, R9 - JA chacha_loop_256 - - VPADDD Y0, Y4, Y4 - VPADDD Y1, Y5, Y5 - VPADDD Y2, Y6, Y6 - VPADDD Y3, Y7, Y7 - VPADDQ TWO, Y3, Y3 - XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13) - VPADDD Y0, Y8, Y8 - VPADDD Y1, Y9, Y9 - VPADDD Y2, Y10, Y10 - VPADDD Y3, Y11, Y11 - VPADDQ TWO, Y3, Y3 - - CMPQ CX, $256 - JB less_than_256 - - XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13) - SUBQ $256, CX - TESTQ CX, CX - JE done - - ADDQ $256, SI - ADDQ $256, DI - JMP between_0_and_64 - -less_than_256: - XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13) - EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12) - ADDQ $192, SI - ADDQ $192, DI - SUBQ $192, CX - JMP finalize - -between_64_and_192: - VMOVDQA Y0, Y4 - VMOVDQA Y1, Y5 - VMOVDQA Y2, Y6 - VMOVDQA Y3, Y7 - - MOVQ DX, R9 - -chacha_loop_128: - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) - CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) - CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) - CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) - SUBQ $2, R9 - JA chacha_loop_128 - - VPADDD Y0, Y4, Y4 - VPADDD Y1, Y5, Y5 - VPADDD Y2, Y6, Y6 - VPADDD Y3, Y7, Y7 - VPADDQ TWO, Y3, Y3 - - CMPQ CX, $128 - JB less_than_128 - - XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13) - 
SUBQ $128, CX - TESTQ CX, CX - JE done - - ADDQ $128, SI - ADDQ $128, DI - JMP between_0_and_64 - -less_than_128: - XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13) - EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13) - ADDQ $64, SI - ADDQ $64, DI - SUBQ $64, CX - JMP finalize - -between_0_and_64: - VMOVDQA X0, X4 - VMOVDQA X1, X5 - VMOVDQA X2, X6 - VMOVDQA X3, X7 - - MOVQ DX, R9 - -chacha_loop_64: - CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15) - CHACHA_SHUFFLE_AVX(X5, X6, X7) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15) - CHACHA_SHUFFLE_AVX(X7, X6, X5) - SUBQ $2, R9 - JA chacha_loop_64 - - VPADDD X0, X4, X4 - VPADDD X1, X5, X5 - VPADDD X2, X6, X6 - VPADDD X3, X7, X7 - VMOVDQU ·one<>(SB), X0 - VPADDQ X0, X3, X3 - - CMPQ CX, $64 - JB less_than_64 - - XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13) - SUBQ $64, CX - JMP done - -less_than_64: - VMOVDQU X4, 0(BX) - VMOVDQU X5, 16(BX) - VMOVDQU X6, 32(BX) - VMOVDQU X7, 48(BX) - -finalize: - XORQ R11, R11 - XORQ R12, R12 - MOVQ CX, BP - -xor_loop: - MOVB 0(SI), R11 - MOVB 0(BX), R12 - XORQ R11, R12 - MOVB R12, 0(DI) - INCQ SI - INCQ BX - INCQ DI - DECQ BP - JA xor_loop - -done: - VMOVDQU X3, 48(AX) - VZEROUPPER - MOVQ R8, SP - MOVQ CX, ret+72(FP) - RET - diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_386.go b/vendor/github.com/aead/chacha20/chacha/chacha_386.go deleted file mode 100644 index 97e533d..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha_386.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// +build 386,!gccgo,!appengine,!nacl - -package chacha - -import ( - "encoding/binary" - - "golang.org/x/sys/cpu" -) - -func init() { - useSSE2 = cpu.X86.HasSSE2 - useSSSE3 = cpu.X86.HasSSSE3 - useAVX = false - useAVX2 = false -} - -func initialize(state *[64]byte, key []byte, nonce *[16]byte) { - binary.LittleEndian.PutUint32(state[0:], sigma[0]) - binary.LittleEndian.PutUint32(state[4:], sigma[1]) - binary.LittleEndian.PutUint32(state[8:], sigma[2]) - binary.LittleEndian.PutUint32(state[12:], sigma[3]) - copy(state[16:], key[:]) - copy(state[48:], nonce[:]) -} - -// This function is implemented in chacha_386.s -//go:noescape -func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) - -// This function is implemented in chacha_386.s -//go:noescape -func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) - -// This function is implemented in chacha_386.s -//go:noescape -func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int - -func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { - switch { - case useSSSE3: - hChaCha20SSSE3(out, nonce, key) - case useSSE2: - hChaCha20SSE2(out, nonce, key) - default: - hChaCha20Generic(out, nonce, key) - } -} - -func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int { - if useSSE2 { - return xorKeyStreamSSE2(dst, src, block, state, rounds) - } else { - return xorKeyStreamGeneric(dst, src, block, state, rounds) - } -} diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_386.s b/vendor/github.com/aead/chacha20/chacha/chacha_386.s deleted file mode 100644 index 262fc86..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha_386.s +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. 
- -// +build 386,!gccgo,!appengine,!nacl - -#include "const.s" -#include "macro.s" - -// FINALIZE xors len bytes from src and block using -// the temp. registers t0 and t1 and writes the result -// to dst. -#define FINALIZE(dst, src, block, len, t0, t1) \ - XORL t0, t0; \ - XORL t1, t1; \ - FINALIZE_LOOP:; \ - MOVB 0(src), t0; \ - MOVB 0(block), t1; \ - XORL t0, t1; \ - MOVB t1, 0(dst); \ - INCL src; \ - INCL block; \ - INCL dst; \ - DECL len; \ - JG FINALIZE_LOOP \ - -#define Dst DI -#define Nonce AX -#define Key BX -#define Rounds DX - -// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) -TEXT ·hChaCha20SSE2(SB), 4, $0-12 - MOVL out+0(FP), Dst - MOVL nonce+4(FP), Nonce - MOVL key+8(FP), Key - - MOVOU ·sigma<>(SB), X0 - MOVOU 0*16(Key), X1 - MOVOU 1*16(Key), X2 - MOVOU 0*16(Nonce), X3 - MOVL $20, Rounds - -chacha_loop: - CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) - CHACHA_SHUFFLE_SSE(X1, X2, X3) - CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) - CHACHA_SHUFFLE_SSE(X3, X2, X1) - SUBL $2, Rounds - JNZ chacha_loop - - MOVOU X0, 0*16(Dst) - MOVOU X3, 1*16(Dst) - RET - -// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) -TEXT ·hChaCha20SSSE3(SB), 4, $0-12 - MOVL out+0(FP), Dst - MOVL nonce+4(FP), Nonce - MOVL key+8(FP), Key - - MOVOU ·sigma<>(SB), X0 - MOVOU 0*16(Key), X1 - MOVOU 1*16(Key), X2 - MOVOU 0*16(Nonce), X3 - MOVL $20, Rounds - - MOVOU ·rol16<>(SB), X5 - MOVOU ·rol8<>(SB), X6 - -chacha_loop: - CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) - CHACHA_SHUFFLE_SSE(X1, X2, X3) - CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) - CHACHA_SHUFFLE_SSE(X3, X2, X1) - SUBL $2, Rounds - JNZ chacha_loop - - MOVOU X0, 0*16(Dst) - MOVOU X3, 1*16(Dst) - RET - -#undef Dst -#undef Nonce -#undef Key -#undef Rounds - -#define State AX -#define Dst DI -#define Src SI -#define Len DX -#define Tmp0 BX -#define Tmp1 BP - -// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int -TEXT ·xorKeyStreamSSE2(SB), 4, $0-40 - MOVL dst_base+0(FP), Dst - MOVL src_base+12(FP), Src - MOVL state+28(FP), State - MOVL src_len+16(FP), Len - MOVL $0, ret+36(FP) // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0 - - MOVOU 0*16(State), X0 - MOVOU 1*16(State), X1 - MOVOU 2*16(State), X2 - MOVOU 3*16(State), X3 - TESTL Len, Len - JZ DONE - -GENERATE_KEYSTREAM: - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X3, X7 - MOVL rounds+32(FP), Tmp0 - -CHACHA_LOOP: - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBL $2, Tmp0 - JA CHACHA_LOOP - - MOVOU 0*16(State), X0 // Restore X0 from state - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - MOVOU ·one<>(SB), X0 - PADDQ X0, X3 - - CMPL Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0) - MOVOU 0*16(State), X0 // Restore X0 from state - ADDL $64, Src - ADDL $64, Dst - SUBL $64, Len - JZ DONE - JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte - -BUFFER_KEYSTREAM: - MOVL block+24(FP), State - MOVOU X4, 0(State) - MOVOU X5, 16(State) - MOVOU X6, 32(State) - MOVOU X7, 48(State) - MOVL Len, ret+36(FP) // Number of bytes written to the keystream buffer - 0 < Len < 64 - FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1) - -DONE: - MOVL state+28(FP), State - MOVOU X3, 3*16(State) - RET - -#undef State -#undef Dst -#undef Src -#undef Len -#undef Tmp0 -#undef Tmp1 diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_amd64.go 
b/vendor/github.com/aead/chacha20/chacha/chacha_amd64.go deleted file mode 100644 index 635f7de..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha_amd64.go +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2017 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// +build go1.7,amd64,!gccgo,!appengine,!nacl - -package chacha - -import "golang.org/x/sys/cpu" - -func init() { - useSSE2 = cpu.X86.HasSSE2 - useSSSE3 = cpu.X86.HasSSSE3 - useAVX = cpu.X86.HasAVX - useAVX2 = cpu.X86.HasAVX2 -} - -// This function is implemented in chacha_amd64.s -//go:noescape -func initialize(state *[64]byte, key []byte, nonce *[16]byte) - -// This function is implemented in chacha_amd64.s -//go:noescape -func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) - -// This function is implemented in chacha_amd64.s -//go:noescape -func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) - -// This function is implemented in chachaAVX2_amd64.s -//go:noescape -func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte) - -// This function is implemented in chacha_amd64.s -//go:noescape -func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int - -// This function is implemented in chacha_amd64.s -//go:noescape -func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int - -// This function is implemented in chacha_amd64.s -//go:noescape -func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int - -// This function is implemented in chachaAVX2_amd64.s -//go:noescape -func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int - -func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { - switch { - case useAVX: - hChaCha20AVX(out, nonce, key) - case useSSSE3: - hChaCha20SSSE3(out, nonce, key) - case useSSE2: - hChaCha20SSE2(out, nonce, key) - default: - hChaCha20Generic(out, nonce, key) - } -} - -func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int { - switch { - case useAVX2: - return xorKeyStreamAVX2(dst, src, block, state, rounds) - case useAVX: - return xorKeyStreamAVX(dst, src, block, state, rounds) - case useSSSE3: - return xorKeyStreamSSSE3(dst, src, block, state, rounds) - case useSSE2: - return xorKeyStreamSSE2(dst, src, block, state, rounds) - default: - return xorKeyStreamGeneric(dst, src, block, state, rounds) - } -} diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s b/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s deleted file mode 100644 index 26a2383..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s +++ /dev/null @@ -1,1072 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// +build amd64,!gccgo,!appengine,!nacl - -#include "const.s" -#include "macro.s" - -// FINALIZE xors len bytes from src and block using -// the temp. registers t0 and t1 and writes the result -// to dst. 
-#define FINALIZE(dst, src, block, len, t0, t1) \ - XORQ t0, t0; \ - XORQ t1, t1; \ - FINALIZE_LOOP:; \ - MOVB 0(src), t0; \ - MOVB 0(block), t1; \ - XORQ t0, t1; \ - MOVB t1, 0(dst); \ - INCQ src; \ - INCQ block; \ - INCQ dst; \ - DECQ len; \ - JG FINALIZE_LOOP \ - -#define Dst DI -#define Nonce AX -#define Key BX -#define Rounds DX - -// func initialize(state *[64]byte, key []byte, nonce *[16]byte) -TEXT ·initialize(SB), 4, $0-40 - MOVQ state+0(FP), Dst - MOVQ key+8(FP), Key - MOVQ nonce+32(FP), Nonce - - MOVOU ·sigma<>(SB), X0 - MOVOU 0*16(Key), X1 - MOVOU 1*16(Key), X2 - MOVOU 0*16(Nonce), X3 - - MOVOU X0, 0*16(Dst) - MOVOU X1, 1*16(Dst) - MOVOU X2, 2*16(Dst) - MOVOU X3, 3*16(Dst) - RET - -// func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte) -TEXT ·hChaCha20AVX(SB), 4, $0-24 - MOVQ out+0(FP), Dst - MOVQ nonce+8(FP), Nonce - MOVQ key+16(FP), Key - - VMOVDQU ·sigma<>(SB), X0 - VMOVDQU 0*16(Key), X1 - VMOVDQU 1*16(Key), X2 - VMOVDQU 0*16(Nonce), X3 - VMOVDQU ·rol16_AVX2<>(SB), X5 - VMOVDQU ·rol8_AVX2<>(SB), X6 - MOVQ $20, Rounds - -CHACHA_LOOP: - CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6) - CHACHA_SHUFFLE_AVX(X1, X2, X3) - CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6) - CHACHA_SHUFFLE_AVX(X3, X2, X1) - SUBQ $2, Rounds - JNZ CHACHA_LOOP - - VMOVDQU X0, 0*16(Dst) - VMOVDQU X3, 1*16(Dst) - VZEROUPPER - RET - -// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) -TEXT ·hChaCha20SSE2(SB), 4, $0-24 - MOVQ out+0(FP), Dst - MOVQ nonce+8(FP), Nonce - MOVQ key+16(FP), Key - - MOVOU ·sigma<>(SB), X0 - MOVOU 0*16(Key), X1 - MOVOU 1*16(Key), X2 - MOVOU 0*16(Nonce), X3 - MOVQ $20, Rounds - -CHACHA_LOOP: - CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) - CHACHA_SHUFFLE_SSE(X1, X2, X3) - CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) - CHACHA_SHUFFLE_SSE(X3, X2, X1) - SUBQ $2, Rounds - JNZ CHACHA_LOOP - - MOVOU X0, 0*16(Dst) - MOVOU X3, 1*16(Dst) - RET - -// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) -TEXT ·hChaCha20SSSE3(SB), 4, $0-24 - MOVQ out+0(FP), Dst - MOVQ nonce+8(FP), Nonce - MOVQ key+16(FP), Key - - MOVOU ·sigma<>(SB), X0 - MOVOU 0*16(Key), X1 - MOVOU 1*16(Key), X2 - MOVOU 0*16(Nonce), X3 - MOVOU ·rol16<>(SB), X5 - MOVOU ·rol8<>(SB), X6 - MOVQ $20, Rounds - -chacha_loop: - CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) - CHACHA_SHUFFLE_SSE(X1, X2, X3) - CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) - CHACHA_SHUFFLE_SSE(X3, X2, X1) - SUBQ $2, Rounds - JNZ chacha_loop - - MOVOU X0, 0*16(Dst) - MOVOU X3, 1*16(Dst) - RET - -#undef Dst -#undef Nonce -#undef Key -#undef Rounds - -#define Dst DI -#define Src SI -#define Len R12 -#define Rounds DX -#define Buffer BX -#define State AX -#define Stack SP -#define SavedSP R8 -#define Tmp0 R9 -#define Tmp1 R10 -#define Tmp2 R11 - -// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int -TEXT ·xorKeyStreamSSE2(SB), 4, $112-80 - MOVQ dst_base+0(FP), Dst - MOVQ src_base+24(FP), Src - MOVQ block+48(FP), Buffer - MOVQ state+56(FP), State - MOVQ rounds+64(FP), Rounds - MOVQ src_len+32(FP), Len - - MOVOU 0*16(State), X0 - MOVOU 1*16(State), X1 - MOVOU 2*16(State), X2 - MOVOU 3*16(State), X3 - - MOVQ Stack, SavedSP - ADDQ $16, Stack - ANDQ $-16, Stack - - TESTQ Len, Len - JZ DONE - - MOVOU ·one<>(SB), X4 - MOVO X0, 0*16(Stack) - MOVO X1, 1*16(Stack) - MOVO X2, 2*16(Stack) - MOVO X3, 3*16(Stack) - MOVO X4, 4*16(Stack) - - CMPQ Len, $64 - JLE GENERATE_KEYSTREAM_64 - CMPQ Len, $128 - JLE GENERATE_KEYSTREAM_128 - CMPQ Len, $192 - JLE GENERATE_KEYSTREAM_192 - -GENERATE_KEYSTREAM_256: - MOVO X0, X12 - 
MOVO X1, X13 - MOVO X2, X14 - MOVO X3, X15 - PADDQ 4*16(Stack), X15 - MOVO X0, X8 - MOVO X1, X9 - MOVO X2, X10 - MOVO X15, X11 - PADDQ 4*16(Stack), X11 - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X11, X7 - PADDQ 4*16(Stack), X7 - MOVQ Rounds, Tmp0 - - MOVO X3, 3*16(Stack) // Save X3 - -CHACHA_LOOP_256: - MOVO X4, 5*16(Stack) - CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) - CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4) - MOVO 5*16(Stack), X4 - MOVO X0, 5*16(Stack) - CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) - MOVO 5*16(Stack), X0 - CHACHA_SHUFFLE_SSE(X1, X2, X3) - CHACHA_SHUFFLE_SSE(X13, X14, X15) - CHACHA_SHUFFLE_SSE(X9, X10, X11) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - MOVO X4, 5*16(Stack) - CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) - CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4) - MOVO 5*16(Stack), X4 - MOVO X0, 5*16(Stack) - CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) - MOVO 5*16(Stack), X0 - CHACHA_SHUFFLE_SSE(X3, X2, X1) - CHACHA_SHUFFLE_SSE(X15, X14, X13) - CHACHA_SHUFFLE_SSE(X11, X10, X9) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_256 - - PADDL 0*16(Stack), X0 - PADDL 1*16(Stack), X1 - PADDL 2*16(Stack), X2 - PADDL 3*16(Stack), X3 - MOVO X4, 5*16(Stack) // Save X4 - XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4) - MOVO 5*16(Stack), X4 // Restore X4 - - MOVO 0*16(Stack), X0 - MOVO 1*16(Stack), X1 - MOVO 2*16(Stack), X2 - MOVO 3*16(Stack), X3 - PADDQ 4*16(Stack), X3 - - PADDL X0, X12 - PADDL X1, X13 - PADDL X2, X14 - PADDL X3, X15 - PADDQ 4*16(Stack), X3 - PADDL X0, X8 - PADDL X1, X9 - PADDL X2, X10 - PADDL X3, X11 - PADDQ 4*16(Stack), X3 - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0) - XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0) - MOVO 0*16(Stack), X0 // Restore X0 - ADDQ $192, Dst - ADDQ $192, Src - SUBQ $192, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE - CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. - JLE GENERATE_KEYSTREAM_64 - CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream. - JLE GENERATE_KEYSTREAM_128 - CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 
192 byte keystream - JG GENERATE_KEYSTREAM_256 - -GENERATE_KEYSTREAM_192: - MOVO X0, X12 - MOVO X1, X13 - MOVO X2, X14 - MOVO X3, X15 - MOVO X0, X8 - MOVO X1, X9 - MOVO X2, X10 - MOVO X3, X11 - PADDQ 4*16(Stack), X11 - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X11, X7 - PADDQ 4*16(Stack), X7 - MOVQ Rounds, Tmp0 - -CHACHA_LOOP_192: - CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0) - CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) - CHACHA_SHUFFLE_SSE(X13, X14, X15) - CHACHA_SHUFFLE_SSE(X9, X10, X11) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0) - CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) - CHACHA_SHUFFLE_SSE(X15, X14, X13) - CHACHA_SHUFFLE_SSE(X11, X10, X9) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_192 - - MOVO 0*16(Stack), X0 // Restore X0 - PADDL X0, X12 - PADDL X1, X13 - PADDL X2, X14 - PADDL X3, X15 - PADDQ 4*16(Stack), X3 - PADDL X0, X8 - PADDL X1, X9 - PADDL X2, X10 - PADDL X3, X11 - PADDQ 4*16(Stack), X3 - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0) - XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0) - MOVO 0*16(Stack), X0 // Restore X0 - ADDQ $128, Dst - ADDQ $128, Src - SUBQ $128, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE - CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. - JLE GENERATE_KEYSTREAM_64 - -GENERATE_KEYSTREAM_128: - MOVO X0, X8 - MOVO X1, X9 - MOVO X2, X10 - MOVO X3, X11 - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X3, X7 - PADDQ 4*16(Stack), X7 - MOVQ Rounds, Tmp0 - -CHACHA_LOOP_128: - CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12) - CHACHA_SHUFFLE_SSE(X9, X10, X11) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12) - CHACHA_SHUFFLE_SSE(X11, X10, X9) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_128 - - PADDL X0, X8 - PADDL X1, X9 - PADDL X2, X10 - PADDL X3, X11 - PADDQ 4*16(Stack), X3 - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream - -GENERATE_KEYSTREAM_64: - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X3, X7 - MOVQ Rounds, Tmp0 - -CHACHA_LOOP_64: - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_64 - - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Src - ADDQ $64, Dst - SUBQ $64, Len - JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true. 
- -BUFFER_KEYSTREAM: - MOVOU X4, 0*16(Buffer) - MOVOU X5, 1*16(Buffer) - MOVOU X6, 2*16(Buffer) - MOVOU X7, 3*16(Buffer) - MOVQ Len, Tmp0 - FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2) - -DONE: - MOVQ SavedSP, Stack // Restore stack pointer - MOVOU X3, 3*16(State) - MOVQ Len, ret+72(FP) - RET - -// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int -TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80 - MOVQ dst_base+0(FP), Dst - MOVQ src_base+24(FP), Src - MOVQ block+48(FP), Buffer - MOVQ state+56(FP), State - MOVQ rounds+64(FP), Rounds - MOVQ src_len+32(FP), Len - - MOVOU 0*16(State), X0 - MOVOU 1*16(State), X1 - MOVOU 2*16(State), X2 - MOVOU 3*16(State), X3 - - MOVQ Stack, SavedSP - ADDQ $16, Stack - ANDQ $-16, Stack - - TESTQ Len, Len - JZ DONE - - MOVOU ·one<>(SB), X4 - MOVOU ·rol16<>(SB), X5 - MOVOU ·rol8<>(SB), X6 - MOVO X0, 0*16(Stack) - MOVO X1, 1*16(Stack) - MOVO X2, 2*16(Stack) - MOVO X3, 3*16(Stack) - MOVO X4, 4*16(Stack) - MOVO X5, 6*16(Stack) - MOVO X6, 7*16(Stack) - - CMPQ Len, $64 - JLE GENERATE_KEYSTREAM_64 - CMPQ Len, $128 - JLE GENERATE_KEYSTREAM_128 - CMPQ Len, $192 - JLE GENERATE_KEYSTREAM_192 - -GENERATE_KEYSTREAM_256: - MOVO X0, X12 - MOVO X1, X13 - MOVO X2, X14 - MOVO X3, X15 - PADDQ 4*16(Stack), X15 - MOVO X0, X8 - MOVO X1, X9 - MOVO X2, X10 - MOVO X15, X11 - PADDQ 4*16(Stack), X11 - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X11, X7 - PADDQ 4*16(Stack), X7 - MOVQ Rounds, Tmp0 - - MOVO X3, 3*16(Stack) // Save X3 - -CHACHA_LOOP_256: - MOVO X4, 5*16(Stack) - CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) - MOVO 5*16(Stack), X4 - MOVO X0, 5*16(Stack) - CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) - MOVO 5*16(Stack), X0 - CHACHA_SHUFFLE_SSE(X1, X2, X3) - CHACHA_SHUFFLE_SSE(X13, X14, X15) - CHACHA_SHUFFLE_SSE(X9, X10, X11) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - MOVO X4, 5*16(Stack) - CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) - MOVO 5*16(Stack), X4 - MOVO X0, 5*16(Stack) - CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) - MOVO 5*16(Stack), X0 - CHACHA_SHUFFLE_SSE(X3, X2, X1) - CHACHA_SHUFFLE_SSE(X15, X14, X13) - CHACHA_SHUFFLE_SSE(X11, X10, X9) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_256 - - PADDL 0*16(Stack), X0 - PADDL 1*16(Stack), X1 - PADDL 2*16(Stack), X2 - PADDL 3*16(Stack), X3 - MOVO X4, 5*16(Stack) // Save X4 - XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4) - MOVO 5*16(Stack), X4 // Restore X4 - - MOVO 0*16(Stack), X0 - MOVO 1*16(Stack), X1 - MOVO 2*16(Stack), X2 - MOVO 3*16(Stack), X3 - PADDQ 4*16(Stack), X3 - - PADDL X0, X12 - PADDL X1, X13 - PADDL X2, X14 - PADDL X3, X15 - PADDQ 4*16(Stack), X3 - PADDL X0, X8 - PADDL X1, X9 - PADDL X2, X10 - PADDL X3, X11 - PADDQ 4*16(Stack), X3 - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0) - XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0) - MOVO 0*16(Stack), X0 // Restore X0 - ADDQ $192, Dst - ADDQ $192, Src - SUBQ $192, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE - CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. 
- JLE GENERATE_KEYSTREAM_64 - CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream. - JLE GENERATE_KEYSTREAM_128 - CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream - JG GENERATE_KEYSTREAM_256 - -GENERATE_KEYSTREAM_192: - MOVO X0, X12 - MOVO X1, X13 - MOVO X2, X14 - MOVO X3, X15 - MOVO X0, X8 - MOVO X1, X9 - MOVO X2, X10 - MOVO X3, X11 - PADDQ 4*16(Stack), X11 - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X11, X7 - PADDQ 4*16(Stack), X7 - MOVQ Rounds, Tmp0 - - MOVO 6*16(Stack), X1 // Load 16 bit rotate-left constant - MOVO 7*16(Stack), X2 // Load 8 bit rotate-left constant - -CHACHA_LOOP_192: - CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2) - CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2) - CHACHA_SHUFFLE_SSE(X13, X14, X15) - CHACHA_SHUFFLE_SSE(X9, X10, X11) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2) - CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2) - CHACHA_SHUFFLE_SSE(X15, X14, X13) - CHACHA_SHUFFLE_SSE(X11, X10, X9) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_192 - - MOVO 0*16(Stack), X0 // Restore X0 - MOVO 1*16(Stack), X1 // Restore X1 - MOVO 2*16(Stack), X2 // Restore X2 - PADDL X0, X12 - PADDL X1, X13 - PADDL X2, X14 - PADDL X3, X15 - PADDQ 4*16(Stack), X3 - PADDL X0, X8 - PADDL X1, X9 - PADDL X2, X10 - PADDL X3, X11 - PADDQ 4*16(Stack), X3 - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0) - XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0) - MOVO 0*16(Stack), X0 // Restore X0 - ADDQ $128, Dst - ADDQ $128, Src - SUBQ $128, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE - CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. 
- JLE GENERATE_KEYSTREAM_64 - -GENERATE_KEYSTREAM_128: - MOVO X0, X8 - MOVO X1, X9 - MOVO X2, X10 - MOVO X3, X11 - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X3, X7 - PADDQ 4*16(Stack), X7 - MOVQ Rounds, Tmp0 - - MOVO 6*16(Stack), X13 // Load 16 bit rotate-left constant - MOVO 7*16(Stack), X14 // Load 8 bit rotate-left constant - -CHACHA_LOOP_128: - CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14) - CHACHA_SHUFFLE_SSE(X9, X10, X11) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14) - CHACHA_SHUFFLE_SSE(X11, X10, X9) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_128 - - PADDL X0, X8 - PADDL X1, X9 - PADDL X2, X10 - PADDL X3, X11 - PADDQ 4*16(Stack), X3 - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream - -GENERATE_KEYSTREAM_64: - MOVO X0, X4 - MOVO X1, X5 - MOVO X2, X6 - MOVO X3, X7 - MOVQ Rounds, Tmp0 - - MOVO 6*16(Stack), X9 // Load 16 bit rotate-left constant - MOVO 7*16(Stack), X10 // Load 8 bit rotate-left constant - -CHACHA_LOOP_64: - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10) - CHACHA_SHUFFLE_SSE(X5, X6, X7) - CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10) - CHACHA_SHUFFLE_SSE(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_64 - - PADDL X0, X4 - PADDL X1, X5 - PADDL X2, X6 - PADDL X3, X7 - PADDQ 4*16(Stack), X3 - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Src - ADDQ $64, Dst - SUBQ $64, Len - JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true. 
- -BUFFER_KEYSTREAM: - MOVOU X4, 0*16(Buffer) - MOVOU X5, 1*16(Buffer) - MOVOU X6, 2*16(Buffer) - MOVOU X7, 3*16(Buffer) - MOVQ Len, Tmp0 - FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2) - -DONE: - MOVQ SavedSP, Stack // Restore stack pointer - MOVOU X3, 3*16(State) - MOVQ Len, ret+72(FP) - RET - -// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int -TEXT ·xorKeyStreamAVX(SB), 4, $144-80 - MOVQ dst_base+0(FP), Dst - MOVQ src_base+24(FP), Src - MOVQ block+48(FP), Buffer - MOVQ state+56(FP), State - MOVQ rounds+64(FP), Rounds - MOVQ src_len+32(FP), Len - - VMOVDQU 0*16(State), X0 - VMOVDQU 1*16(State), X1 - VMOVDQU 2*16(State), X2 - VMOVDQU 3*16(State), X3 - - MOVQ Stack, SavedSP - ADDQ $16, Stack - ANDQ $-16, Stack - - TESTQ Len, Len - JZ DONE - - VMOVDQU ·one<>(SB), X4 - VMOVDQU ·rol16<>(SB), X5 - VMOVDQU ·rol8<>(SB), X6 - VMOVDQA X0, 0*16(Stack) - VMOVDQA X1, 1*16(Stack) - VMOVDQA X2, 2*16(Stack) - VMOVDQA X3, 3*16(Stack) - VMOVDQA X4, 4*16(Stack) - VMOVDQA X5, 6*16(Stack) - VMOVDQA X6, 7*16(Stack) - - CMPQ Len, $64 - JLE GENERATE_KEYSTREAM_64 - CMPQ Len, $128 - JLE GENERATE_KEYSTREAM_128 - CMPQ Len, $192 - JLE GENERATE_KEYSTREAM_192 - -GENERATE_KEYSTREAM_256: - VMOVDQA X0, X12 - VMOVDQA X1, X13 - VMOVDQA X2, X14 - VMOVDQA X3, X15 - VPADDQ 4*16(Stack), X15, X15 - VMOVDQA X0, X8 - VMOVDQA X1, X9 - VMOVDQA X2, X10 - VMOVDQA X15, X11 - VPADDQ 4*16(Stack), X11, X11 - VMOVDQA X0, X4 - VMOVDQA X1, X5 - VMOVDQA X2, X6 - VMOVDQA X11, X7 - VPADDQ 4*16(Stack), X7, X7 - MOVQ Rounds, Tmp0 - - VMOVDQA X3, 3*16(Stack) // Save X3 - -CHACHA_LOOP_256: - VMOVDQA X4, 5*16(Stack) - CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) - VMOVDQA 5*16(Stack), X4 - VMOVDQA X0, 5*16(Stack) - CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) - VMOVDQA 5*16(Stack), X0 - CHACHA_SHUFFLE_AVX(X1, X2, X3) - CHACHA_SHUFFLE_AVX(X13, X14, X15) - CHACHA_SHUFFLE_AVX(X9, X10, X11) - CHACHA_SHUFFLE_AVX(X5, X6, X7) - VMOVDQA X4, 5*16(Stack) - CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) - VMOVDQA 5*16(Stack), X4 - VMOVDQA X0, 5*16(Stack) - CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) - VMOVDQA 5*16(Stack), X0 - CHACHA_SHUFFLE_AVX(X3, X2, X1) - CHACHA_SHUFFLE_AVX(X15, X14, X13) - CHACHA_SHUFFLE_AVX(X11, X10, X9) - CHACHA_SHUFFLE_AVX(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_256 - - VPADDD 0*16(Stack), X0, X0 - VPADDD 1*16(Stack), X1, X1 - VPADDD 2*16(Stack), X2, X2 - VPADDD 3*16(Stack), X3, X3 - VMOVDQA X4, 5*16(Stack) // Save X4 - XOR_AVX(Dst, Src, 0, X0, X1, X2, X3, X4) - VMOVDQA 5*16(Stack), X4 // Restore X4 - - VMOVDQA 0*16(Stack), X0 - VMOVDQA 1*16(Stack), X1 - VMOVDQA 2*16(Stack), X2 - VMOVDQA 3*16(Stack), X3 - VPADDQ 4*16(Stack), X3, X3 - - VPADDD X0, X12, X12 - VPADDD X1, X13, X13 - VPADDD X2, X14, X14 - VPADDD X3, X15, X15 - VPADDQ 4*16(Stack), X3, X3 - VPADDD X0, X8, X8 - VPADDD X1, X9, X9 - VPADDD X2, X10, X10 - VPADDD X3, X11, X11 - VPADDQ 4*16(Stack), X3, X3 - VPADDD X0, X4, X4 - VPADDD X1, X5, X5 - VPADDD X2, X6, X6 - VPADDD X3, X7, X7 - VPADDQ 4*16(Stack), X3, X3 - - XOR_AVX(Dst, Src, 64, X12, X13, X14, X15, X0) - XOR_AVX(Dst, Src, 128, X8, X9, X10, X11, X0) - VMOVDQA 0*16(Stack), X0 // Restore X0 - ADDQ $192, Dst - ADDQ $192, Src - SUBQ $192, Len - - 
CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE - CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. - JLE GENERATE_KEYSTREAM_64 - CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream. - JLE GENERATE_KEYSTREAM_128 - CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream - JG GENERATE_KEYSTREAM_256 - -GENERATE_KEYSTREAM_192: - VMOVDQA X0, X12 - VMOVDQA X1, X13 - VMOVDQA X2, X14 - VMOVDQA X3, X15 - VMOVDQA X0, X8 - VMOVDQA X1, X9 - VMOVDQA X2, X10 - VMOVDQA X3, X11 - VPADDQ 4*16(Stack), X11, X11 - VMOVDQA X0, X4 - VMOVDQA X1, X5 - VMOVDQA X2, X6 - VMOVDQA X11, X7 - VPADDQ 4*16(Stack), X7, X7 - MOVQ Rounds, Tmp0 - - VMOVDQA 6*16(Stack), X1 // Load 16 bit rotate-left constant - VMOVDQA 7*16(Stack), X2 // Load 8 bit rotate-left constant - -CHACHA_LOOP_192: - CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2) - CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2) - CHACHA_SHUFFLE_AVX(X13, X14, X15) - CHACHA_SHUFFLE_AVX(X9, X10, X11) - CHACHA_SHUFFLE_AVX(X5, X6, X7) - CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2) - CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2) - CHACHA_SHUFFLE_AVX(X15, X14, X13) - CHACHA_SHUFFLE_AVX(X11, X10, X9) - CHACHA_SHUFFLE_AVX(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_192 - - VMOVDQA 0*16(Stack), X0 // Restore X0 - VMOVDQA 1*16(Stack), X1 // Restore X1 - VMOVDQA 2*16(Stack), X2 // Restore X2 - VPADDD X0, X12, X12 - VPADDD X1, X13, X13 - VPADDD X2, X14, X14 - VPADDD X3, X15, X15 - VPADDQ 4*16(Stack), X3, X3 - VPADDD X0, X8, X8 - VPADDD X1, X9, X9 - VPADDD X2, X10, X10 - VPADDD X3, X11, X11 - VPADDQ 4*16(Stack), X3, X3 - VPADDD X0, X4, X4 - VPADDD X1, X5, X5 - VPADDD X2, X6, X6 - VPADDD X3, X7, X7 - VPADDQ 4*16(Stack), X3, X3 - - XOR_AVX(Dst, Src, 0, X12, X13, X14, X15, X0) - XOR_AVX(Dst, Src, 64, X8, X9, X10, X11, X0) - VMOVDQA 0*16(Stack), X0 // Restore X0 - ADDQ $128, Dst - ADDQ $128, Src - SUBQ $128, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE - CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. 
- JLE GENERATE_KEYSTREAM_64 - -GENERATE_KEYSTREAM_128: - VMOVDQA X0, X8 - VMOVDQA X1, X9 - VMOVDQA X2, X10 - VMOVDQA X3, X11 - VMOVDQA X0, X4 - VMOVDQA X1, X5 - VMOVDQA X2, X6 - VMOVDQA X3, X7 - VPADDQ 4*16(Stack), X7, X7 - MOVQ Rounds, Tmp0 - - VMOVDQA 6*16(Stack), X13 // Load 16 bit rotate-left constant - VMOVDQA 7*16(Stack), X14 // Load 8 bit rotate-left constant - -CHACHA_LOOP_128: - CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14) - CHACHA_SHUFFLE_AVX(X9, X10, X11) - CHACHA_SHUFFLE_AVX(X5, X6, X7) - CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14) - CHACHA_SHUFFLE_AVX(X11, X10, X9) - CHACHA_SHUFFLE_AVX(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_128 - - VPADDD X0, X8, X8 - VPADDD X1, X9, X9 - VPADDD X2, X10, X10 - VPADDD X3, X11, X11 - VPADDQ 4*16(Stack), X3, X3 - VPADDD X0, X4, X4 - VPADDD X1, X5, X5 - VPADDD X2, X6, X6 - VPADDD X3, X7, X7 - VPADDQ 4*16(Stack), X3, X3 - - XOR_AVX(Dst, Src, 0, X8, X9, X10, X11, X12) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Dst - ADDQ $64, Src - SUBQ $64, Len - JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream - -GENERATE_KEYSTREAM_64: - VMOVDQA X0, X4 - VMOVDQA X1, X5 - VMOVDQA X2, X6 - VMOVDQA X3, X7 - MOVQ Rounds, Tmp0 - - VMOVDQA 6*16(Stack), X9 // Load 16 bit rotate-left constant - VMOVDQA 7*16(Stack), X10 // Load 8 bit rotate-left constant - -CHACHA_LOOP_64: - CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10) - CHACHA_SHUFFLE_AVX(X5, X6, X7) - CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10) - CHACHA_SHUFFLE_AVX(X7, X6, X5) - SUBQ $2, Tmp0 - JNZ CHACHA_LOOP_64 - - VPADDD X0, X4, X4 - VPADDD X1, X5, X5 - VPADDD X2, X6, X6 - VPADDD X3, X7, X7 - VPADDQ 4*16(Stack), X3, X3 - - CMPQ Len, $64 - JL BUFFER_KEYSTREAM - - XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) - ADDQ $64, Src - ADDQ $64, Dst - SUBQ $64, Len - JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true. - -BUFFER_KEYSTREAM: - VMOVDQU X4, 0*16(Buffer) - VMOVDQU X5, 1*16(Buffer) - VMOVDQU X6, 2*16(Buffer) - VMOVDQU X7, 3*16(Buffer) - MOVQ Len, Tmp0 - FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2) - -DONE: - MOVQ SavedSP, Stack // Restore stack pointer - VMOVDQU X3, 3*16(State) - VZEROUPPER - MOVQ Len, ret+72(FP) - RET - -#undef Dst -#undef Src -#undef Len -#undef Rounds -#undef Buffer -#undef State -#undef Stack -#undef SavedSP -#undef Tmp0 -#undef Tmp1 -#undef Tmp2 diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_generic.go b/vendor/github.com/aead/chacha20/chacha/chacha_generic.go deleted file mode 100644 index 8832d5b..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha_generic.go +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. 
- -package chacha - -import "encoding/binary" - -var sigma = [4]uint32{0x61707865, 0x3320646e, 0x79622d32, 0x6b206574} - -func xorKeyStreamGeneric(dst, src []byte, block, state *[64]byte, rounds int) int { - for len(src) >= 64 { - chachaGeneric(block, state, rounds) - - for i, v := range block { - dst[i] = src[i] ^ v - } - src = src[64:] - dst = dst[64:] - } - - n := len(src) - if n > 0 { - chachaGeneric(block, state, rounds) - for i, v := range src { - dst[i] = v ^ block[i] - } - } - return n -} - -func chachaGeneric(dst *[64]byte, state *[64]byte, rounds int) { - v00 := binary.LittleEndian.Uint32(state[0:]) - v01 := binary.LittleEndian.Uint32(state[4:]) - v02 := binary.LittleEndian.Uint32(state[8:]) - v03 := binary.LittleEndian.Uint32(state[12:]) - v04 := binary.LittleEndian.Uint32(state[16:]) - v05 := binary.LittleEndian.Uint32(state[20:]) - v06 := binary.LittleEndian.Uint32(state[24:]) - v07 := binary.LittleEndian.Uint32(state[28:]) - v08 := binary.LittleEndian.Uint32(state[32:]) - v09 := binary.LittleEndian.Uint32(state[36:]) - v10 := binary.LittleEndian.Uint32(state[40:]) - v11 := binary.LittleEndian.Uint32(state[44:]) - v12 := binary.LittleEndian.Uint32(state[48:]) - v13 := binary.LittleEndian.Uint32(state[52:]) - v14 := binary.LittleEndian.Uint32(state[56:]) - v15 := binary.LittleEndian.Uint32(state[60:]) - - s00, s01, s02, s03, s04, s05, s06, s07 := v00, v01, v02, v03, v04, v05, v06, v07 - s08, s09, s10, s11, s12, s13, s14, s15 := v08, v09, v10, v11, v12, v13, v14, v15 - - for i := 0; i < rounds; i += 2 { - v00 += v04 - v12 ^= v00 - v12 = (v12 << 16) | (v12 >> 16) - v08 += v12 - v04 ^= v08 - v04 = (v04 << 12) | (v04 >> 20) - v00 += v04 - v12 ^= v00 - v12 = (v12 << 8) | (v12 >> 24) - v08 += v12 - v04 ^= v08 - v04 = (v04 << 7) | (v04 >> 25) - v01 += v05 - v13 ^= v01 - v13 = (v13 << 16) | (v13 >> 16) - v09 += v13 - v05 ^= v09 - v05 = (v05 << 12) | (v05 >> 20) - v01 += v05 - v13 ^= v01 - v13 = (v13 << 8) | (v13 >> 24) - v09 += v13 - v05 ^= v09 - v05 = (v05 << 7) | (v05 >> 25) - v02 += v06 - v14 ^= v02 - v14 = (v14 << 16) | (v14 >> 16) - v10 += v14 - v06 ^= v10 - v06 = (v06 << 12) | (v06 >> 20) - v02 += v06 - v14 ^= v02 - v14 = (v14 << 8) | (v14 >> 24) - v10 += v14 - v06 ^= v10 - v06 = (v06 << 7) | (v06 >> 25) - v03 += v07 - v15 ^= v03 - v15 = (v15 << 16) | (v15 >> 16) - v11 += v15 - v07 ^= v11 - v07 = (v07 << 12) | (v07 >> 20) - v03 += v07 - v15 ^= v03 - v15 = (v15 << 8) | (v15 >> 24) - v11 += v15 - v07 ^= v11 - v07 = (v07 << 7) | (v07 >> 25) - v00 += v05 - v15 ^= v00 - v15 = (v15 << 16) | (v15 >> 16) - v10 += v15 - v05 ^= v10 - v05 = (v05 << 12) | (v05 >> 20) - v00 += v05 - v15 ^= v00 - v15 = (v15 << 8) | (v15 >> 24) - v10 += v15 - v05 ^= v10 - v05 = (v05 << 7) | (v05 >> 25) - v01 += v06 - v12 ^= v01 - v12 = (v12 << 16) | (v12 >> 16) - v11 += v12 - v06 ^= v11 - v06 = (v06 << 12) | (v06 >> 20) - v01 += v06 - v12 ^= v01 - v12 = (v12 << 8) | (v12 >> 24) - v11 += v12 - v06 ^= v11 - v06 = (v06 << 7) | (v06 >> 25) - v02 += v07 - v13 ^= v02 - v13 = (v13 << 16) | (v13 >> 16) - v08 += v13 - v07 ^= v08 - v07 = (v07 << 12) | (v07 >> 20) - v02 += v07 - v13 ^= v02 - v13 = (v13 << 8) | (v13 >> 24) - v08 += v13 - v07 ^= v08 - v07 = (v07 << 7) | (v07 >> 25) - v03 += v04 - v14 ^= v03 - v14 = (v14 << 16) | (v14 >> 16) - v09 += v14 - v04 ^= v09 - v04 = (v04 << 12) | (v04 >> 20) - v03 += v04 - v14 ^= v03 - v14 = (v14 << 8) | (v14 >> 24) - v09 += v14 - v04 ^= v09 - v04 = (v04 << 7) | (v04 >> 25) - } - - v00 += s00 - v01 += s01 - v02 += s02 - v03 += s03 - v04 += s04 - v05 += s05 - v06 += s06 - v07 += s07 
- v08 += s08 - v09 += s09 - v10 += s10 - v11 += s11 - v12 += s12 - v13 += s13 - v14 += s14 - v15 += s15 - - s12++ - binary.LittleEndian.PutUint32(state[48:], s12) - if s12 == 0 { // indicates overflow - s13++ - binary.LittleEndian.PutUint32(state[52:], s13) - } - - binary.LittleEndian.PutUint32(dst[0:], v00) - binary.LittleEndian.PutUint32(dst[4:], v01) - binary.LittleEndian.PutUint32(dst[8:], v02) - binary.LittleEndian.PutUint32(dst[12:], v03) - binary.LittleEndian.PutUint32(dst[16:], v04) - binary.LittleEndian.PutUint32(dst[20:], v05) - binary.LittleEndian.PutUint32(dst[24:], v06) - binary.LittleEndian.PutUint32(dst[28:], v07) - binary.LittleEndian.PutUint32(dst[32:], v08) - binary.LittleEndian.PutUint32(dst[36:], v09) - binary.LittleEndian.PutUint32(dst[40:], v10) - binary.LittleEndian.PutUint32(dst[44:], v11) - binary.LittleEndian.PutUint32(dst[48:], v12) - binary.LittleEndian.PutUint32(dst[52:], v13) - binary.LittleEndian.PutUint32(dst[56:], v14) - binary.LittleEndian.PutUint32(dst[60:], v15) -} - -func hChaCha20Generic(out *[32]byte, nonce *[16]byte, key *[32]byte) { - v00 := sigma[0] - v01 := sigma[1] - v02 := sigma[2] - v03 := sigma[3] - v04 := binary.LittleEndian.Uint32(key[0:]) - v05 := binary.LittleEndian.Uint32(key[4:]) - v06 := binary.LittleEndian.Uint32(key[8:]) - v07 := binary.LittleEndian.Uint32(key[12:]) - v08 := binary.LittleEndian.Uint32(key[16:]) - v09 := binary.LittleEndian.Uint32(key[20:]) - v10 := binary.LittleEndian.Uint32(key[24:]) - v11 := binary.LittleEndian.Uint32(key[28:]) - v12 := binary.LittleEndian.Uint32(nonce[0:]) - v13 := binary.LittleEndian.Uint32(nonce[4:]) - v14 := binary.LittleEndian.Uint32(nonce[8:]) - v15 := binary.LittleEndian.Uint32(nonce[12:]) - - for i := 0; i < 20; i += 2 { - v00 += v04 - v12 ^= v00 - v12 = (v12 << 16) | (v12 >> 16) - v08 += v12 - v04 ^= v08 - v04 = (v04 << 12) | (v04 >> 20) - v00 += v04 - v12 ^= v00 - v12 = (v12 << 8) | (v12 >> 24) - v08 += v12 - v04 ^= v08 - v04 = (v04 << 7) | (v04 >> 25) - v01 += v05 - v13 ^= v01 - v13 = (v13 << 16) | (v13 >> 16) - v09 += v13 - v05 ^= v09 - v05 = (v05 << 12) | (v05 >> 20) - v01 += v05 - v13 ^= v01 - v13 = (v13 << 8) | (v13 >> 24) - v09 += v13 - v05 ^= v09 - v05 = (v05 << 7) | (v05 >> 25) - v02 += v06 - v14 ^= v02 - v14 = (v14 << 16) | (v14 >> 16) - v10 += v14 - v06 ^= v10 - v06 = (v06 << 12) | (v06 >> 20) - v02 += v06 - v14 ^= v02 - v14 = (v14 << 8) | (v14 >> 24) - v10 += v14 - v06 ^= v10 - v06 = (v06 << 7) | (v06 >> 25) - v03 += v07 - v15 ^= v03 - v15 = (v15 << 16) | (v15 >> 16) - v11 += v15 - v07 ^= v11 - v07 = (v07 << 12) | (v07 >> 20) - v03 += v07 - v15 ^= v03 - v15 = (v15 << 8) | (v15 >> 24) - v11 += v15 - v07 ^= v11 - v07 = (v07 << 7) | (v07 >> 25) - v00 += v05 - v15 ^= v00 - v15 = (v15 << 16) | (v15 >> 16) - v10 += v15 - v05 ^= v10 - v05 = (v05 << 12) | (v05 >> 20) - v00 += v05 - v15 ^= v00 - v15 = (v15 << 8) | (v15 >> 24) - v10 += v15 - v05 ^= v10 - v05 = (v05 << 7) | (v05 >> 25) - v01 += v06 - v12 ^= v01 - v12 = (v12 << 16) | (v12 >> 16) - v11 += v12 - v06 ^= v11 - v06 = (v06 << 12) | (v06 >> 20) - v01 += v06 - v12 ^= v01 - v12 = (v12 << 8) | (v12 >> 24) - v11 += v12 - v06 ^= v11 - v06 = (v06 << 7) | (v06 >> 25) - v02 += v07 - v13 ^= v02 - v13 = (v13 << 16) | (v13 >> 16) - v08 += v13 - v07 ^= v08 - v07 = (v07 << 12) | (v07 >> 20) - v02 += v07 - v13 ^= v02 - v13 = (v13 << 8) | (v13 >> 24) - v08 += v13 - v07 ^= v08 - v07 = (v07 << 7) | (v07 >> 25) - v03 += v04 - v14 ^= v03 - v14 = (v14 << 16) | (v14 >> 16) - v09 += v14 - v04 ^= v09 - v04 = (v04 << 12) | (v04 >> 20) - v03 += v04 - v14 
^= v03 - v14 = (v14 << 8) | (v14 >> 24) - v09 += v14 - v04 ^= v09 - v04 = (v04 << 7) | (v04 >> 25) - } - - binary.LittleEndian.PutUint32(out[0:], v00) - binary.LittleEndian.PutUint32(out[4:], v01) - binary.LittleEndian.PutUint32(out[8:], v02) - binary.LittleEndian.PutUint32(out[12:], v03) - binary.LittleEndian.PutUint32(out[16:], v12) - binary.LittleEndian.PutUint32(out[20:], v13) - binary.LittleEndian.PutUint32(out[24:], v14) - binary.LittleEndian.PutUint32(out[28:], v15) -} diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_ref.go b/vendor/github.com/aead/chacha20/chacha/chacha_ref.go deleted file mode 100644 index 526877c..0000000 --- a/vendor/github.com/aead/chacha20/chacha/chacha_ref.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// +build !amd64,!386 gccgo appengine nacl - -package chacha - -import "encoding/binary" - -func init() { - useSSE2 = false - useSSSE3 = false - useAVX = false - useAVX2 = false -} - -func initialize(state *[64]byte, key []byte, nonce *[16]byte) { - binary.LittleEndian.PutUint32(state[0:], sigma[0]) - binary.LittleEndian.PutUint32(state[4:], sigma[1]) - binary.LittleEndian.PutUint32(state[8:], sigma[2]) - binary.LittleEndian.PutUint32(state[12:], sigma[3]) - copy(state[16:], key[:]) - copy(state[48:], nonce[:]) -} - -func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int { - return xorKeyStreamGeneric(dst, src, block, state, rounds) -} - -func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { - hChaCha20Generic(out, nonce, key) -} diff --git a/vendor/github.com/aead/chacha20/chacha/const.s b/vendor/github.com/aead/chacha20/chacha/const.s deleted file mode 100644 index c7a94a4..0000000 --- a/vendor/github.com/aead/chacha20/chacha/const.s +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. 
- -// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl - -#include "textflag.h" - -DATA ·sigma<>+0x00(SB)/4, $0x61707865 -DATA ·sigma<>+0x04(SB)/4, $0x3320646e -DATA ·sigma<>+0x08(SB)/4, $0x79622d32 -DATA ·sigma<>+0x0C(SB)/4, $0x6b206574 -GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16 // The 4 ChaCha initialization constants - -// SSE2/SSE3/AVX constants - -DATA ·one<>+0x00(SB)/8, $1 -DATA ·one<>+0x08(SB)/8, $0 -GLOBL ·one<>(SB), (NOPTR+RODATA), $16 // The constant 1 as 128 bit value - -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 16 bit left rotate constant - -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 8 bit left rotate constant - -// AVX2 constants - -DATA ·one_AVX2<>+0x00(SB)/8, $0 -DATA ·one_AVX2<>+0x08(SB)/8, $0 -DATA ·one_AVX2<>+0x10(SB)/8, $1 -DATA ·one_AVX2<>+0x18(SB)/8, $0 -GLOBL ·one_AVX2<>(SB), (NOPTR+RODATA), $32 // The constant 1 as 256 bit value - -DATA ·two_AVX2<>+0x00(SB)/8, $2 -DATA ·two_AVX2<>+0x08(SB)/8, $0 -DATA ·two_AVX2<>+0x10(SB)/8, $2 -DATA ·two_AVX2<>+0x18(SB)/8, $0 -GLOBL ·two_AVX2<>(SB), (NOPTR+RODATA), $32 - -DATA ·rol16_AVX2<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16_AVX2<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16_AVX2<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16_AVX2<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -GLOBL ·rol16_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 16 bit left rotate constant - -DATA ·rol8_AVX2<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8_AVX2<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8_AVX2<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8_AVX2<>+0x18(SB)/8, $0x0E0D0C0F0A09080B -GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 8 bit left rotate constant diff --git a/vendor/github.com/aead/chacha20/chacha/macro.s b/vendor/github.com/aead/chacha20/chacha/macro.s deleted file mode 100644 index 780108f..0000000 --- a/vendor/github.com/aead/chacha20/chacha/macro.s +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2018 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl - -// ROTL_SSE rotates all 4 32 bit values of the XMM register v -// left by n bits using SSE2 instructions (0 <= n <= 32). -// The XMM register t is used as a temp. register. -#define ROTL_SSE(n, t, v) \ - MOVO v, t; \ - PSLLL $n, t; \ - PSRLL $(32-n), v; \ - PXOR t, v - -// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v -// left by n bits using AVX/AVX2 instructions (0 <= n <= 32). -// The AVX/AVX2 register t is used as a temp. register. -#define ROTL_AVX(n, t, v) \ - VPSLLD $n, v, t; \ - VPSRLD $(32-n), v, v; \ - VPXOR v, t, v - -// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the -// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE2 for -// rotations. The XMM register t is used as a temp. register. -#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE(12, t, v1); \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE(8, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE(7, t, v1) - -// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the -// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit -// rotations. The XMM register t is used as a temp. 
register. -// -// r16 holds the PSHUFB constant for a 16 bit left rotate. -// r8 holds the PSHUFB constant for a 8 bit left rotate. -#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \ - PADDL v1, v0; \ - PXOR v0, v3; \ - PSHUFB r16, v3; \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE(12, t, v1); \ - PADDL v1, v0; \ - PXOR v0, v3; \ - PSHUFB r8, v3; \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE(7, t, v1) - -// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the -// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit -// rotations. The AVX/AVX2 register t is used as a temp. register. -// -// r16 holds the VPSHUFB constant for a 16 bit left rotate. -// r8 holds the VPSHUFB constant for a 8 bit left rotate. -#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \ - VPADDD v0, v1, v0; \ - VPXOR v3, v0, v3; \ - VPSHUFB r16, v3, v3; \ - VPADDD v2, v3, v2; \ - VPXOR v1, v2, v1; \ - ROTL_AVX(12, t, v1); \ - VPADDD v0, v1, v0; \ - VPXOR v3, v0, v3; \ - VPSHUFB r8, v3, v3; \ - VPADDD v2, v3, v2; \ - VPXOR v1, v2, v1; \ - ROTL_AVX(7, t, v1) - -// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the -// 3 XMM registers v1, v2 and v3. The inverse shuffle is -// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1). -#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3 - -// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the -// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is -// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1). -#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \ - VPSHUFD $0x39, v1, v1; \ - VPSHUFD $0x4E, v2, v2; \ - VPSHUFD $0x93, v3, v3 - -// XOR_SSE extracts 4x16 byte vectors from src at -// off, xors all vectors with the corresponding XMM -// register (v0 - v3) and writes the result to dst -// at off. -// The XMM register t is used as a temp. register. -#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \ - MOVOU 0+off(src), t; \ - PXOR v0, t; \ - MOVOU t, 0+off(dst); \ - MOVOU 16+off(src), t; \ - PXOR v1, t; \ - MOVOU t, 16+off(dst); \ - MOVOU 32+off(src), t; \ - PXOR v2, t; \ - MOVOU t, 32+off(dst); \ - MOVOU 48+off(src), t; \ - PXOR v3, t; \ - MOVOU t, 48+off(dst) - -// XOR_AVX extracts 4x16 byte vectors from src at -// off, xors all vectors with the corresponding AVX -// register (v0 - v3) and writes the result to dst -// at off. -// The XMM register t is used as a temp. register. 
-#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \ - VPXOR 0+off(src), v0, t; \ - VMOVDQU t, 0+off(dst); \ - VPXOR 16+off(src), v1, t; \ - VMOVDQU t, 16+off(dst); \ - VPXOR 32+off(src), v2, t; \ - VMOVDQU t, 32+off(dst); \ - VPXOR 48+off(src), v3, t; \ - VMOVDQU t, 48+off(dst) - -#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \ - VMOVDQU (0+off)(src), t0; \ - VPERM2I128 $32, v1, v0, t1; \ - VPXOR t0, t1, t0; \ - VMOVDQU t0, (0+off)(dst); \ - VMOVDQU (32+off)(src), t0; \ - VPERM2I128 $32, v3, v2, t1; \ - VPXOR t0, t1, t0; \ - VMOVDQU t0, (32+off)(dst); \ - VMOVDQU (64+off)(src), t0; \ - VPERM2I128 $49, v1, v0, t1; \ - VPXOR t0, t1, t0; \ - VMOVDQU t0, (64+off)(dst); \ - VMOVDQU (96+off)(src), t0; \ - VPERM2I128 $49, v3, v2, t1; \ - VPXOR t0, t1, t0; \ - VMOVDQU t0, (96+off)(dst) - -#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \ - VMOVDQU (0+off)(src), t0; \ - VPERM2I128 $32, v1, v0, t1; \ - VPXOR t0, t1, t0; \ - VMOVDQU t0, (0+off)(dst); \ - VMOVDQU (32+off)(src), t0; \ - VPERM2I128 $32, v3, v2, t1; \ - VPXOR t0, t1, t0; \ - VMOVDQU t0, (32+off)(dst); \ - -#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \ - VPERM2I128 $49, v1, v0, t0; \ - VMOVDQU t0, 0(dst); \ - VPERM2I128 $49, v3, v2, t0; \ - VMOVDQU t0, 32(dst) diff --git a/vendor/github.com/aead/chacha20/chacha20.go b/vendor/github.com/aead/chacha20/chacha20.go deleted file mode 100644 index df6ddd2..0000000 --- a/vendor/github.com/aead/chacha20/chacha20.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. -// Use of this source code is governed by a license that can be -// found in the LICENSE file. - -// Package chacha20 implements the ChaCha20 / XChaCha20 stream chipher. -// Notice that one specific key-nonce combination must be unique for all time. -// -// There are three versions of ChaCha20: -// - ChaCha20 with a 64 bit nonce (en/decrypt up to 2^64 * 64 bytes for one key-nonce combination) -// - ChaCha20 with a 96 bit nonce (en/decrypt up to 2^32 * 64 bytes (~256 GB) for one key-nonce combination) -// - XChaCha20 with a 192 bit nonce (en/decrypt up to 2^64 * 64 bytes for one key-nonce combination) -package chacha20 // import "github.com/aead/chacha20" - -import ( - "crypto/cipher" - - "github.com/aead/chacha20/chacha" -) - -// XORKeyStream crypts bytes from src to dst using the given nonce and key. -// The length of the nonce determinds the version of ChaCha20: -// - 8 bytes: ChaCha20 with a 64 bit nonce and a 2^64 * 64 byte period. -// - 12 bytes: ChaCha20 as defined in RFC 7539 and a 2^32 * 64 byte period. -// - 24 bytes: XChaCha20 with a 192 bit nonce and a 2^64 * 64 byte period. -// Src and dst may be the same slice but otherwise should not overlap. -// If len(dst) < len(src) this function panics. -// If the nonce is neither 64, 96 nor 192 bits long, this function panics. -func XORKeyStream(dst, src, nonce, key []byte) { - chacha.XORKeyStream(dst, src, nonce, key, 20) -} - -// NewCipher returns a new cipher.Stream implementing a ChaCha20 version. -// The nonce must be unique for one key for all time. -// The length of the nonce determinds the version of ChaCha20: -// - 8 bytes: ChaCha20 with a 64 bit nonce and a 2^64 * 64 byte period. -// - 12 bytes: ChaCha20 as defined in RFC 7539 and a 2^32 * 64 byte period. -// - 24 bytes: XChaCha20 with a 192 bit nonce and a 2^64 * 64 byte period. -// If the nonce is neither 64, 96 nor 192 bits long, a non-nil error is returned. 
-func NewCipher(nonce, key []byte) (cipher.Stream, error) { - return chacha.NewCipher(nonce, key, 20) -} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go new file mode 100644 index 0000000..661ea13 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go @@ -0,0 +1,16 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego + +package chacha20 + +const bufSize = 256 + +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s new file mode 100644 index 0000000..7dd2638 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s @@ -0,0 +1,307 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego + +#include "textflag.h" + +#define NUM_ROUNDS 10 + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD dst+0(FP), R1 + MOVD src+24(FP), R2 + MOVD src_len+32(FP), R3 + MOVD key+48(FP), R4 + MOVD nonce+56(FP), R6 + MOVD counter+64(FP), R7 + + MOVD $·constants(SB), R10 + MOVD $·incRotMatrix(SB), R11 + + MOVW (R7), R20 + + AND $~255, R3, R13 + ADD R2, R13, R12 // R12 for block end + AND $255, R3, R13 +loop: + MOVD $NUM_ROUNDS, R21 + VLD1 (R11), [V30.S4, V31.S4] + + // load contants + // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x4D60E940 + + // load keys + // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4] + WORD $0x4DFFE884 + // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4] + WORD $0x4DFFE888 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V12.S4] + WORD $0x4D40C8EC + + // VLD3R (R6), [V13.S4, V14.S4, V15.S4] + WORD $0x4D40E8CD + + // update counter + VADD V30.S4, V12.S4, V12.S4 + +chacha: + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12) + VADD V8.S4, V12.S4, V8.S4 + VADD V9.S4, V13.S4, V9.S4 + VADD V10.S4, V14.S4, V10.S4 + VADD V11.S4, V15.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $12, V16.S4, V4.S4 + VSHL $12, V17.S4, V5.S4 + VSHL $12, V18.S4, V6.S4 + VSHL $12, V19.S4, V7.S4 + VSRI $20, V16.S4, V4.S4 + VSRI $20, V17.S4, V5.S4 + VSRI $20, V18.S4, V6.S4 + VSRI $20, V19.S4, V7.S4 + + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + 
VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7) + VADD V12.S4, V8.S4, V8.S4 + VADD V13.S4, V9.S4, V9.S4 + VADD V14.S4, V10.S4, V10.S4 + VADD V15.S4, V11.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $7, V16.S4, V4.S4 + VSHL $7, V17.S4, V5.S4 + VSHL $7, V18.S4, V6.S4 + VSHL $7, V19.S4, V7.S4 + VSRI $25, V16.S4, V4.S4 + VSRI $25, V17.S4, V5.S4 + VSRI $25, V18.S4, V6.S4 + VSRI $25, V19.S4, V7.S4 + + // V0..V3 += V5..V7, V4 + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) + VADD V0.S4, V5.S4, V0.S4 + VADD V1.S4, V6.S4, V1.S4 + VADD V2.S4, V7.S4, V2.S4 + VADD V3.S4, V4.S4, V3.S4 + VEOR V15.B16, V0.B16, V15.B16 + VEOR V12.B16, V1.B16, V12.B16 + VEOR V13.B16, V2.B16, V13.B16 + VEOR V14.B16, V3.B16, V14.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 12) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $12, V16.S4, V5.S4 + VSHL $12, V17.S4, V6.S4 + VSHL $12, V18.S4, V7.S4 + VSHL $12, V19.S4, V4.S4 + VSRI $20, V16.S4, V5.S4 + VSRI $20, V17.S4, V6.S4 + VSRI $20, V18.S4, V7.S4 + VSRI $20, V19.S4, V4.S4 + + // V0 += V5; V15 <<<= ((V0 XOR V15), 8) + // ... + VADD V5.S4, V0.S4, V0.S4 + VADD V6.S4, V1.S4, V1.S4 + VADD V7.S4, V2.S4, V2.S4 + VADD V4.S4, V3.S4, V3.S4 + VEOR V0.B16, V15.B16, V15.B16 + VEOR V1.B16, V12.B16, V12.B16 + VEOR V2.B16, V13.B16, V13.B16 + VEOR V3.B16, V14.B16, V14.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 7) + // ... 
+ VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $7, V16.S4, V5.S4 + VSHL $7, V17.S4, V6.S4 + VSHL $7, V18.S4, V7.S4 + VSHL $7, V19.S4, V4.S4 + VSRI $25, V16.S4, V5.S4 + VSRI $25, V17.S4, V6.S4 + VSRI $25, V18.S4, V7.S4 + VSRI $25, V19.S4, V4.S4 + + SUB $1, R21 + CBNZ R21, chacha + + // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4] + WORD $0x4D60E950 + + // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4] + WORD $0x4DFFE894 + VADD V30.S4, V12.S4, V12.S4 + VADD V16.S4, V0.S4, V0.S4 + VADD V17.S4, V1.S4, V1.S4 + VADD V18.S4, V2.S4, V2.S4 + VADD V19.S4, V3.S4, V3.S4 + // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4] + WORD $0x4DFFE898 + // restore R4 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V28.S4] + WORD $0x4D40C8FC + // VLD3R (R6), [V29.S4, V30.S4, V31.S4] + WORD $0x4D40E8DD + + VADD V20.S4, V4.S4, V4.S4 + VADD V21.S4, V5.S4, V5.S4 + VADD V22.S4, V6.S4, V6.S4 + VADD V23.S4, V7.S4, V7.S4 + VADD V24.S4, V8.S4, V8.S4 + VADD V25.S4, V9.S4, V9.S4 + VADD V26.S4, V10.S4, V10.S4 + VADD V27.S4, V11.S4, V11.S4 + VADD V28.S4, V12.S4, V12.S4 + VADD V29.S4, V13.S4, V13.S4 + VADD V30.S4, V14.S4, V14.S4 + VADD V31.S4, V15.S4, V15.S4 + + VZIP1 V1.S4, V0.S4, V16.S4 + VZIP2 V1.S4, V0.S4, V17.S4 + VZIP1 V3.S4, V2.S4, V18.S4 + VZIP2 V3.S4, V2.S4, V19.S4 + VZIP1 V5.S4, V4.S4, V20.S4 + VZIP2 V5.S4, V4.S4, V21.S4 + VZIP1 V7.S4, V6.S4, V22.S4 + VZIP2 V7.S4, V6.S4, V23.S4 + VZIP1 V9.S4, V8.S4, V24.S4 + VZIP2 V9.S4, V8.S4, V25.S4 + VZIP1 V11.S4, V10.S4, V26.S4 + VZIP2 V11.S4, V10.S4, V27.S4 + VZIP1 V13.S4, V12.S4, V28.S4 + VZIP2 V13.S4, V12.S4, V29.S4 + VZIP1 V15.S4, V14.S4, V30.S4 + VZIP2 V15.S4, V14.S4, V31.S4 + VZIP1 V18.D2, V16.D2, V0.D2 + VZIP2 V18.D2, V16.D2, V4.D2 + VZIP1 V19.D2, V17.D2, V8.D2 + VZIP2 V19.D2, V17.D2, V12.D2 + VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] + + VZIP1 V22.D2, V20.D2, V1.D2 + VZIP2 V22.D2, V20.D2, V5.D2 + VZIP1 V23.D2, V21.D2, V9.D2 + VZIP2 V23.D2, V21.D2, V13.D2 + VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] + VZIP1 V26.D2, V24.D2, V2.D2 + VZIP2 V26.D2, V24.D2, V6.D2 + VZIP1 V27.D2, V25.D2, V10.D2 + VZIP2 V27.D2, V25.D2, V14.D2 + VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] + VZIP1 V30.D2, V28.D2, V3.D2 + VZIP2 V30.D2, V28.D2, V7.D2 + VZIP1 V31.D2, V29.D2, V11.D2 + VZIP2 V31.D2, V29.D2, V15.D2 + VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] + VEOR V0.B16, V16.B16, V16.B16 + VEOR V1.B16, V17.B16, V17.B16 + VEOR V2.B16, V18.B16, V18.B16 + VEOR V3.B16, V19.B16, V19.B16 + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1) + VEOR V4.B16, V20.B16, V20.B16 + VEOR V5.B16, V21.B16, V21.B16 + VEOR V6.B16, V22.B16, V22.B16 + VEOR V7.B16, V23.B16, V23.B16 + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1) + VEOR V8.B16, V24.B16, V24.B16 + VEOR V9.B16, V25.B16, V25.B16 + VEOR V10.B16, V26.B16, V26.B16 + VEOR V11.B16, V27.B16, V27.B16 + VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1) + VEOR V12.B16, V28.B16, V28.B16 + VEOR V13.B16, V29.B16, V29.B16 + VEOR V14.B16, V30.B16, V30.B16 + VEOR V15.B16, V31.B16, V31.B16 + VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1) + + ADD $4, R20 + MOVW R20, (R7) // update counter + + CMP R2, R12 + BGT loop + + RET + + +DATA ·constants+0x00(SB)/4, $0x61707865 +DATA ·constants+0x04(SB)/4, $0x3320646e +DATA ·constants+0x08(SB)/4, $0x79622d32 +DATA ·constants+0x0c(SB)/4, $0x6b206574 +GLOBL ·constants(SB), 
NOPTR|RODATA, $32 + +DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 +DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 +DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 +DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 +DATA ·incRotMatrix+0x10(SB)/4, $0x02010003 +DATA ·incRotMatrix+0x14(SB)/4, $0x06050407 +DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B +DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F +GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go new file mode 100644 index 0000000..93eb5ae --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go @@ -0,0 +1,398 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20 implements the ChaCha20 and XChaCha20 encryption algorithms +// as specified in RFC 8439 and draft-irtf-cfrg-xchacha-01. +package chacha20 + +import ( + "crypto/cipher" + "encoding/binary" + "errors" + "math/bits" + + "golang.org/x/crypto/internal/alias" +) + +const ( + // KeySize is the size of the key used by this cipher, in bytes. + KeySize = 32 + + // NonceSize is the size of the nonce used with the standard variant of this + // cipher, in bytes. + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. + NonceSize = 12 + + // NonceSizeX is the size of the nonce used with the XChaCha20 variant of + // this cipher, in bytes. + NonceSizeX = 24 +) + +// Cipher is a stateful instance of ChaCha20 or XChaCha20 using a particular key +// and nonce. A *Cipher implements the cipher.Stream interface. +type Cipher struct { + // The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter + // (incremented after each block), and 3 of nonce. + key [8]uint32 + counter uint32 + nonce [3]uint32 + + // The last len bytes of buf are leftover key stream bytes from the previous + // XORKeyStream invocation. The size of buf depends on how many blocks are + // computed at a time by xorKeyStreamBlocks. + buf [bufSize]byte + len int + + // overflow is set when the counter overflowed, no more blocks can be + // generated, and the next XORKeyStream call should panic. + overflow bool + + // The counter-independent results of the first round are cached after they + // are computed the first time. + precompDone bool + p1, p5, p9, p13 uint32 + p2, p6, p10, p14 uint32 + p3, p7, p11, p15 uint32 +} + +var _ cipher.Stream = (*Cipher)(nil) + +// NewUnauthenticatedCipher creates a new ChaCha20 stream cipher with the given +// 32 bytes key and a 12 or 24 bytes nonce. If a nonce of 24 bytes is provided, +// the XChaCha20 construction will be used. It returns an error if key or nonce +// have any other length. +// +// Note that ChaCha20, like all stream ciphers, is not authenticated and allows +// attackers to silently tamper with the plaintext. For this reason, it is more +// appropriate as a building block than as a standalone encryption mechanism. +// Instead, consider using package golang.org/x/crypto/chacha20poly1305. +func NewUnauthenticatedCipher(key, nonce []byte) (*Cipher, error) { + // This function is split into a wrapper so that the Cipher allocation will + // be inlined, and depending on how the caller uses the return value, won't + // escape to the heap. 
+ c := &Cipher{} + return newUnauthenticatedCipher(c, key, nonce) +} + +func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20: wrong key size") + } + if len(nonce) == NonceSizeX { + // XChaCha20 uses the ChaCha20 core to mix 16 bytes of the nonce into a + // derived key, allowing it to operate on a nonce of 24 bytes. See + // draft-irtf-cfrg-xchacha-01, Section 2.3. + key, _ = HChaCha20(key, nonce[0:16]) + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + nonce = cNonce + } else if len(nonce) != NonceSize { + return nil, errors.New("chacha20: wrong nonce size") + } + + key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint + c.key = [8]uint32{ + binary.LittleEndian.Uint32(key[0:4]), + binary.LittleEndian.Uint32(key[4:8]), + binary.LittleEndian.Uint32(key[8:12]), + binary.LittleEndian.Uint32(key[12:16]), + binary.LittleEndian.Uint32(key[16:20]), + binary.LittleEndian.Uint32(key[20:24]), + binary.LittleEndian.Uint32(key[24:28]), + binary.LittleEndian.Uint32(key[28:32]), + } + c.nonce = [3]uint32{ + binary.LittleEndian.Uint32(nonce[0:4]), + binary.LittleEndian.Uint32(nonce[4:8]), + binary.LittleEndian.Uint32(nonce[8:12]), + } + return c, nil +} + +// The constant first 4 words of the ChaCha20 state. +const ( + j0 uint32 = 0x61707865 // expa + j1 uint32 = 0x3320646e // nd 3 + j2 uint32 = 0x79622d32 // 2-by + j3 uint32 = 0x6b206574 // te k +) + +const blockSize = 64 + +// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words. +// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16 +// words each round, in columnar or diagonal groups of 4 at a time. +func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) { + a += b + d ^= a + d = bits.RotateLeft32(d, 16) + c += d + b ^= c + b = bits.RotateLeft32(b, 12) + a += b + d ^= a + d = bits.RotateLeft32(d, 8) + c += d + b ^= c + b = bits.RotateLeft32(b, 7) + return a, b, c, d +} + +// SetCounter sets the Cipher counter. The next invocation of XORKeyStream will +// behave as if (64 * counter) bytes had been encrypted so far. +// +// To prevent accidental counter reuse, SetCounter panics if counter is less +// than the current value. +// +// Note that the execution time of XORKeyStream is not independent of the +// counter value. +func (s *Cipher) SetCounter(counter uint32) { + // Internally, s may buffer multiple blocks, which complicates this + // implementation slightly. When checking whether the counter has rolled + // back, we must use both s.counter and s.len to determine how many blocks + // we have already output. + outputCounter := s.counter - uint32(s.len)/blockSize + if s.overflow || counter < outputCounter { + panic("chacha20: SetCounter attempted to rollback counter") + } + + // In the general case, we set the new counter value and reset s.len to 0, + // causing the next call to XORKeyStream to refill the buffer. However, if + // we're advancing within the existing buffer, we can save work by simply + // setting s.len. + if counter < s.counter { + s.len = int(s.counter-counter) * blockSize + } else { + s.counter = counter + s.len = 0 + } +} + +// XORKeyStream XORs each byte in the given slice with a byte from the +// cipher's key stream. Dst and src must overlap entirely or not at all. +// +// If len(dst) < len(src), XORKeyStream will panic. 
It is acceptable +// to pass a dst bigger than src, and in that case, XORKeyStream will +// only update dst[:len(src)] and will not touch the rest of dst. +// +// Multiple calls to XORKeyStream behave as if the concatenation of +// the src buffers was passed in a single run. That is, Cipher +// maintains state and does not reset at each XORKeyStream call. +func (s *Cipher) XORKeyStream(dst, src []byte) { + if len(src) == 0 { + return + } + if len(dst) < len(src) { + panic("chacha20: output smaller than input") + } + dst = dst[:len(src)] + if alias.InexactOverlap(dst, src) { + panic("chacha20: invalid buffer overlap") + } + + // First, drain any remaining key stream from a previous XORKeyStream. + if s.len != 0 { + keyStream := s.buf[bufSize-s.len:] + if len(src) < len(keyStream) { + keyStream = keyStream[:len(src)] + } + _ = src[len(keyStream)-1] // bounds check elimination hint + for i, b := range keyStream { + dst[i] = src[i] ^ b + } + s.len -= len(keyStream) + dst, src = dst[len(keyStream):], src[len(keyStream):] + } + if len(src) == 0 { + return + } + + // If we'd need to let the counter overflow and keep generating output, + // panic immediately. If instead we'd only reach the last block, remember + // not to generate any more output after the buffer is drained. + numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize + if s.overflow || uint64(s.counter)+numBlocks > 1<<32 { + panic("chacha20: counter overflow") + } else if uint64(s.counter)+numBlocks == 1<<32 { + s.overflow = true + } + + // xorKeyStreamBlocks implementations expect input lengths that are a + // multiple of bufSize. Platform-specific ones process multiple blocks at a + // time, so have bufSizes that are a multiple of blockSize. + + full := len(src) - len(src)%bufSize + if full > 0 { + s.xorKeyStreamBlocks(dst[:full], src[:full]) + } + dst, src = dst[full:], src[full:] + + // If using a multi-block xorKeyStreamBlocks would overflow, use the generic + // one that does one block at a time. + const blocksPerBuf = bufSize / blockSize + if uint64(s.counter)+blocksPerBuf > 1<<32 { + s.buf = [bufSize]byte{} + numBlocks := (len(src) + blockSize - 1) / blockSize + buf := s.buf[bufSize-numBlocks*blockSize:] + copy(buf, src) + s.xorKeyStreamBlocksGeneric(buf, buf) + s.len = len(buf) - copy(dst, buf) + return + } + + // If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and + // keep the leftover keystream for the next XORKeyStream invocation. + if len(src) > 0 { + s.buf = [bufSize]byte{} + copy(s.buf[:], src) + s.xorKeyStreamBlocks(s.buf[:], s.buf[:]) + s.len = bufSize - copy(dst, s.buf[:]) + } +} + +func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) { + if len(dst) != len(src) || len(dst)%blockSize != 0 { + panic("chacha20: internal error: wrong dst and/or src length") + } + + // To generate each block of key stream, the initial cipher state + // (represented below) is passed through 20 rounds of shuffling, + // alternatively applying quarterRounds by columns (like 1, 5, 9, 13) + // or by diagonals (like 1, 6, 11, 12). 
+ // + // 0:cccccccc 1:cccccccc 2:cccccccc 3:cccccccc + // 4:kkkkkkkk 5:kkkkkkkk 6:kkkkkkkk 7:kkkkkkkk + // 8:kkkkkkkk 9:kkkkkkkk 10:kkkkkkkk 11:kkkkkkkk + // 12:bbbbbbbb 13:nnnnnnnn 14:nnnnnnnn 15:nnnnnnnn + // + // c=constant k=key b=blockcount n=nonce + var ( + c0, c1, c2, c3 = j0, j1, j2, j3 + c4, c5, c6, c7 = s.key[0], s.key[1], s.key[2], s.key[3] + c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7] + _, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2] + ) + + // Three quarters of the first round don't depend on the counter, so we can + // calculate them here, and reuse them for multiple blocks in the loop, and + // for future XORKeyStream invocations. + if !s.precompDone { + s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13) + s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14) + s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15) + s.precompDone = true + } + + // A condition of len(src) > 0 would be sufficient, but this also + // acts as a bounds check elimination hint. + for len(src) >= 64 && len(dst) >= 64 { + // The remainder of the first column round. + fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter) + + // The second diagonal round. + x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15) + x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12) + x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13) + x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14) + + // The remaining 18 rounds. + for i := 0; i < 9; i++ { + // Column round. + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + // Diagonal round. + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + // Add back the initial state to generate the key stream, then + // XOR the key stream with the source and write out the result. + addXor(dst[0:4], src[0:4], x0, c0) + addXor(dst[4:8], src[4:8], x1, c1) + addXor(dst[8:12], src[8:12], x2, c2) + addXor(dst[12:16], src[12:16], x3, c3) + addXor(dst[16:20], src[16:20], x4, c4) + addXor(dst[20:24], src[20:24], x5, c5) + addXor(dst[24:28], src[24:28], x6, c6) + addXor(dst[28:32], src[28:32], x7, c7) + addXor(dst[32:36], src[32:36], x8, c8) + addXor(dst[36:40], src[36:40], x9, c9) + addXor(dst[40:44], src[40:44], x10, c10) + addXor(dst[44:48], src[44:48], x11, c11) + addXor(dst[48:52], src[48:52], x12, s.counter) + addXor(dst[52:56], src[52:56], x13, c13) + addXor(dst[56:60], src[56:60], x14, c14) + addXor(dst[60:64], src[60:64], x15, c15) + + s.counter += 1 + + src, dst = src[blockSize:], dst[blockSize:] + } +} + +// HChaCha20 uses the ChaCha20 core to generate a derived key from a 32 bytes +// key and a 16 bytes nonce. It returns an error if key or nonce have any other +// length. It is used as part of the XChaCha20 construction. +func HChaCha20(key, nonce []byte) ([]byte, error) { + // This function is split into a wrapper so that the slice allocation will + // be inlined, and depending on how the caller uses the return value, won't + // escape to the heap. 
+ out := make([]byte, 32) + return hChaCha20(out, key, nonce) +} + +func hChaCha20(out, key, nonce []byte) ([]byte, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20: wrong HChaCha20 key size") + } + if len(nonce) != 16 { + return nil, errors.New("chacha20: wrong HChaCha20 nonce size") + } + + x0, x1, x2, x3 := j0, j1, j2, j3 + x4 := binary.LittleEndian.Uint32(key[0:4]) + x5 := binary.LittleEndian.Uint32(key[4:8]) + x6 := binary.LittleEndian.Uint32(key[8:12]) + x7 := binary.LittleEndian.Uint32(key[12:16]) + x8 := binary.LittleEndian.Uint32(key[16:20]) + x9 := binary.LittleEndian.Uint32(key[20:24]) + x10 := binary.LittleEndian.Uint32(key[24:28]) + x11 := binary.LittleEndian.Uint32(key[28:32]) + x12 := binary.LittleEndian.Uint32(nonce[0:4]) + x13 := binary.LittleEndian.Uint32(nonce[4:8]) + x14 := binary.LittleEndian.Uint32(nonce[8:12]) + x15 := binary.LittleEndian.Uint32(nonce[12:16]) + + for i := 0; i < 10; i++ { + // Diagonal round. + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + // Column round. + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + _ = out[31] // bounds check elimination hint + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x12) + binary.LittleEndian.PutUint32(out[20:24], x13) + binary.LittleEndian.PutUint32(out[24:28], x14) + binary.LittleEndian.PutUint32(out[28:32], x15) + return out, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go new file mode 100644 index 0000000..db42e66 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go @@ -0,0 +1,13 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego + +package chacha20 + +const bufSize = blockSize + +func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) { + s.xorKeyStreamBlocksGeneric(dst, src) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go new file mode 100644 index 0000000..3a4287f --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go @@ -0,0 +1,16 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego + +package chacha20 + +const bufSize = 256 + +//go:noescape +func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s new file mode 100644 index 0000000..66aebae --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s @@ -0,0 +1,449 @@ +// Copyright 2019 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Based on CRYPTOGAMS code with the following comment: +// # ==================================================================== +// # Written by Andy Polyakov for the OpenSSL +// # project. The module is, however, dual licensed under OpenSSL and +// # CRYPTOGAMS licenses depending on where you obtain it. For further +// # details see http://www.openssl.org/~appro/cryptogams/. +// # ==================================================================== + +// Code for the perl script that generates the ppc64 assembler +// can be found in the cryptogams repository at the link below. It is based on +// the original from openssl. + +// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91 + +// The differences in this and the original implementation are +// due to the calling conventions and initialization of constants. + +//go:build gc && !purego + +#include "textflag.h" + +#define OUT R3 +#define INP R4 +#define LEN R5 +#define KEY R6 +#define CNT R7 +#define TMP R15 + +#define CONSTBASE R16 +#define BLOCKS R17 + +DATA consts<>+0x00(SB)/8, $0x3320646e61707865 +DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 +DATA consts<>+0x10(SB)/8, $0x0000000000000001 +DATA consts<>+0x18(SB)/8, $0x0000000000000000 +DATA consts<>+0x20(SB)/8, $0x0000000000000004 +DATA consts<>+0x28(SB)/8, $0x0000000000000000 +DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d +DATA consts<>+0x38(SB)/8, $0x0203000106070405 +DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c +DATA consts<>+0x48(SB)/8, $0x0102030005060704 +DATA consts<>+0x50(SB)/8, $0x6170786561707865 +DATA consts<>+0x58(SB)/8, $0x6170786561707865 +DATA consts<>+0x60(SB)/8, $0x3320646e3320646e +DATA consts<>+0x68(SB)/8, $0x3320646e3320646e +DATA consts<>+0x70(SB)/8, $0x79622d3279622d32 +DATA consts<>+0x78(SB)/8, $0x79622d3279622d32 +DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 +DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 +DATA consts<>+0x90(SB)/8, $0x0000000100000000 +DATA consts<>+0x98(SB)/8, $0x0000000300000002 +GLOBL consts<>(SB), RODATA, $0xa0 + +//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) +TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 + MOVD out+0(FP), OUT + MOVD inp+8(FP), INP + MOVD len+16(FP), LEN + MOVD key+24(FP), KEY + MOVD counter+32(FP), CNT + + // Addressing for constants + MOVD $consts<>+0x00(SB), CONSTBASE + MOVD $16, R8 + MOVD $32, R9 + MOVD $48, R10 + MOVD $64, R11 + SRD $6, LEN, BLOCKS + // V16 + LXVW4X (CONSTBASE)(R0), VS48 + ADD $80,CONSTBASE + + // Load key into V17,V18 + LXVW4X (KEY)(R0), VS49 + LXVW4X (KEY)(R8), VS50 + + // Load CNT, NONCE into V19 + LXVW4X (CNT)(R0), VS51 + + // Clear V27 + VXOR V27, V27, V27 + + // V28 + LXVW4X (CONSTBASE)(R11), VS60 + + // splat slot from V19 -> V26 + VSPLTW $0, V19, V26 + + VSLDOI $4, V19, V27, V19 + VSLDOI $12, V27, V19, V19 + + VADDUWM V26, V28, V26 + + MOVD $10, R14 + MOVD R14, CTR + +loop_outer_vsx: + // V0, V1, V2, V3 + LXVW4X (R0)(CONSTBASE), VS32 + LXVW4X (R8)(CONSTBASE), VS33 + LXVW4X (R9)(CONSTBASE), VS34 + LXVW4X (R10)(CONSTBASE), VS35 + + // splat values from V17, V18 into V4-V11 + VSPLTW $0, V17, V4 + VSPLTW $1, V17, V5 + VSPLTW $2, V17, V6 + VSPLTW $3, V17, V7 + VSPLTW $0, V18, V8 + VSPLTW $1, V18, V9 + VSPLTW $2, V18, V10 + VSPLTW $3, V18, V11 + + // VOR + VOR V26, V26, V12 + + // splat values from V19 -> V13, V14, V15 + VSPLTW $1, V19, V13 + VSPLTW $2, V19, V14 + VSPLTW $3, V19, V15 + + // splat const values + VSPLTISW $-16, V27 + VSPLTISW 
$12, V28 + VSPLTISW $8, V29 + VSPLTISW $7, V30 + +loop_vsx: + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VXOR V12, V0, V12 + VXOR V13, V1, V13 + VXOR V14, V2, V14 + VXOR V15, V3, V15 + + VRLW V12, V27, V12 + VRLW V13, V27, V13 + VRLW V14, V27, V14 + VRLW V15, V27, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V28, V4 + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VXOR V12, V0, V12 + VXOR V13, V1, V13 + VXOR V14, V2, V14 + VXOR V15, V3, V15 + + VRLW V12, V29, V12 + VRLW V13, V29, V13 + VRLW V14, V29, V14 + VRLW V15, V29, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V30, V4 + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VXOR V15, V0, V15 + VXOR V12, V1, V12 + VXOR V13, V2, V13 + VXOR V14, V3, V14 + + VRLW V15, V27, V15 + VRLW V12, V27, V12 + VRLW V13, V27, V13 + VRLW V14, V27, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + VRLW V4, V28, V4 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VXOR V15, V0, V15 + VXOR V12, V1, V12 + VXOR V13, V2, V13 + VXOR V14, V3, V14 + + VRLW V15, V29, V15 + VRLW V12, V29, V12 + VRLW V13, V29, V13 + VRLW V14, V29, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + VRLW V4, V30, V4 + BC 16, LT, loop_vsx + + VADDUWM V12, V26, V12 + + WORD $0x13600F8C // VMRGEW V0, V1, V27 + WORD $0x13821F8C // VMRGEW V2, V3, V28 + + WORD $0x10000E8C // VMRGOW V0, V1, V0 + WORD $0x10421E8C // VMRGOW V2, V3, V2 + + WORD $0x13A42F8C // VMRGEW V4, V5, V29 + WORD $0x13C63F8C // VMRGEW V6, V7, V30 + + XXPERMDI VS32, VS34, $0, VS33 + XXPERMDI VS32, VS34, $3, VS35 + XXPERMDI VS59, VS60, $0, VS32 + XXPERMDI VS59, VS60, $3, VS34 + + WORD $0x10842E8C // VMRGOW V4, V5, V4 + WORD $0x10C63E8C // VMRGOW V6, V7, V6 + + WORD $0x13684F8C // VMRGEW V8, V9, V27 + WORD $0x138A5F8C // VMRGEW V10, V11, V28 + + XXPERMDI VS36, VS38, $0, VS37 + XXPERMDI VS36, VS38, $3, VS39 + XXPERMDI VS61, VS62, $0, VS36 + XXPERMDI VS61, VS62, $3, VS38 + + WORD $0x11084E8C // VMRGOW V8, V9, V8 + WORD $0x114A5E8C // VMRGOW V10, V11, V10 + + WORD $0x13AC6F8C // VMRGEW V12, V13, V29 + WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + + XXPERMDI VS40, VS42, $0, VS41 + XXPERMDI VS40, VS42, $3, VS43 + XXPERMDI VS59, VS60, $0, VS40 + XXPERMDI VS59, VS60, $3, VS42 + + WORD $0x118C6E8C // VMRGOW V12, V13, V12 + WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + + VSPLTISW $4, V27 + VADDUWM V26, V27, V26 + + XXPERMDI VS44, VS46, $0, VS45 + XXPERMDI VS44, VS46, $3, VS47 + XXPERMDI VS61, VS62, $0, VS44 + XXPERMDI VS61, VS62, $3, VS46 + + VADDUWM V0, V16, V0 + VADDUWM V4, V17, V4 + VADDUWM V8, V18, V8 + VADDUWM V12, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + // Bottom of loop + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 
+ LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V1, V16, V0 + VADDUWM V5, V17, V4 + VADDUWM V9, V18, V8 + VADDUWM V13, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + VXOR V27, V0, V27 + + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(V10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V2, V16, V0 + VADDUWM V6, V17, V4 + VADDUWM V10, V18, V8 + VADDUWM V14, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V3, V16, V0 + VADDUWM V7, V17, V4 + VADDUWM V11, V18, V8 + VADDUWM V15, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + + MOVD $10, R14 + MOVD R14, CTR + BNE loop_outer_vsx + +done_vsx: + // Increment counter by number of 64 byte blocks + MOVD (CNT), R14 + ADD BLOCKS, R14 + MOVD R14, (CNT) + RET + +tail_vsx: + ADD $32, R1, R11 + MOVD LEN, CTR + + // Save values on stack to copy from + STXVW4X VS32, (R11)(R0) + STXVW4X VS36, (R11)(R8) + STXVW4X VS40, (R11)(R9) + STXVW4X VS44, (R11)(R10) + ADD $-1, R11, R12 + ADD $-1, INP + ADD $-1, OUT + +looptail_vsx: + // Copying the result to OUT + // in bytes. + MOVBZU 1(R12), KEY + MOVBZU 1(INP), TMP + XOR KEY, TMP, KEY + MOVBU KEY, 1(OUT) + BC 16, LT, looptail_vsx + + // Clear the stack values + STXVW4X VS48, (R11)(R0) + STXVW4X VS48, (R11)(R8) + STXVW4X VS48, (R11)(R9) + STXVW4X VS48, (R11)(R10) + BR done_vsx diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go new file mode 100644 index 0000000..683ccfd --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go @@ -0,0 +1,27 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego + +package chacha20 + +import "golang.org/x/sys/cpu" + +var haveAsm = cpu.S390X.HasVX + +const bufSize = 256 + +// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only +// be called when the vector facility is available. Implementation in asm_s390x.s. 
+// +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + if cpu.S390X.HasVX { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) + } else { + c.xorKeyStreamBlocksGeneric(dst, src) + } +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s new file mode 100644 index 0000000..1eda91a --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s @@ -0,0 +1,224 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego + +#include "go_asm.h" +#include "textflag.h" + +// This is an implementation of the ChaCha20 encryption algorithm as +// specified in RFC 7539. It uses vector instructions to compute +// 4 keystream blocks in parallel (256 bytes) which are then XORed +// with the bytes in the input slice. + +GLOBL ·constants<>(SB), RODATA|NOPTR, $32 +// BSWAP: swap bytes in each 4-byte element +DATA ·constants<>+0x00(SB)/4, $0x03020100 +DATA ·constants<>+0x04(SB)/4, $0x07060504 +DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 +DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c +// J0: [j0, j1, j2, j3] +DATA ·constants<>+0x10(SB)/4, $0x61707865 +DATA ·constants<>+0x14(SB)/4, $0x3320646e +DATA ·constants<>+0x18(SB)/4, $0x79622d32 +DATA ·constants<>+0x1c(SB)/4, $0x6b206574 + +#define BSWAP V5 +#define J0 V6 +#define KEY0 V7 +#define KEY1 V8 +#define NONCE V9 +#define CTR V10 +#define M0 V11 +#define M1 V12 +#define M2 V13 +#define M3 V14 +#define INC V15 +#define X0 V16 +#define X1 V17 +#define X2 V18 +#define X3 V19 +#define X4 V20 +#define X5 V21 +#define X6 V22 +#define X7 V23 +#define X8 V24 +#define X9 V25 +#define X10 V26 +#define X11 V27 +#define X12 V28 +#define X13 V29 +#define X14 V30 +#define X15 V31 + +#define NUM_ROUNDS 20 + +#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $16, a2, a2 \ + VERLLF $16, b2, b2 \ + VERLLF $16, c2, c2 \ + VERLLF $16, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $12, a1, a1 \ + VERLLF $12, b1, b1 \ + VERLLF $12, c1, c1 \ + VERLLF $12, d1, d1 \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $8, a2, a2 \ + VERLLF $8, b2, b2 \ + VERLLF $8, c2, c2 \ + VERLLF $8, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $7, a1, a1 \ + VERLLF $7, b1, b1 \ + VERLLF $7, c1, c1 \ + VERLLF $7, d1, d1 + +#define PERMUTE(mask, v0, v1, v2, v3) \ + VPERM v0, v0, mask, v0 \ + VPERM v1, v1, mask, v1 \ + VPERM v2, v2, mask, v2 \ + VPERM v3, v3, mask, v3 + +#define ADDV(x, v0, v1, v2, v3) \ + VAF x, v0, v0 \ + VAF x, v1, v1 \ + VAF x, v2, v2 \ + VAF x, v3, v3 + +#define XORV(off, dst, src, v0, v1, v2, v3) \ + VLM off(src), M0, M3 \ + PERMUTE(BSWAP, v0, v1, v2, v3) \ + VX v0, M0, M0 \ + VX v1, M1, M1 \ + VX v2, M2, M2 \ + VX v3, M3, M3 \ + VSTM M0, M3, off(dst) + +#define SHUFFLE(a, b, c, d, t, u, v, w) \ + VMRHF a, c, t \ // t = {a[0], c[0], a[1], 
c[1]}
+ VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
+ VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
+ VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
+ VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
+ VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
+ VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
+ VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
+
+// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
+TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
+ MOVD $·constants<>(SB), R1
+ MOVD dst+0(FP), R2 // R2=&dst[0]
+ LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
+ MOVD key+48(FP), R5 // R5=key
+ MOVD nonce+56(FP), R6 // R6=nonce
+ MOVD counter+64(FP), R7 // R7=counter
+
+ // load BSWAP and J0
+ VLM (R1), BSWAP, J0
+
+ // setup
+ MOVD $95, R0
+ VLM (R5), KEY0, KEY1
+ VLL R0, (R6), NONCE
+ VZERO M0
+ VLEIB $7, $32, M0
+ VSRLB M0, NONCE, NONCE
+
+ // initialize counter values
+ VLREPF (R7), CTR
+ VZERO INC
+ VLEIF $1, $1, INC
+ VLEIF $2, $2, INC
+ VLEIF $3, $3, INC
+ VAF INC, CTR, CTR
+ VREPIF $4, INC
+
+chacha:
+ VREPF $0, J0, X0
+ VREPF $1, J0, X1
+ VREPF $2, J0, X2
+ VREPF $3, J0, X3
+ VREPF $0, KEY0, X4
+ VREPF $1, KEY0, X5
+ VREPF $2, KEY0, X6
+ VREPF $3, KEY0, X7
+ VREPF $0, KEY1, X8
+ VREPF $1, KEY1, X9
+ VREPF $2, KEY1, X10
+ VREPF $3, KEY1, X11
+ VLR CTR, X12
+ VREPF $1, NONCE, X13
+ VREPF $2, NONCE, X14
+ VREPF $3, NONCE, X15
+
+ MOVD $(NUM_ROUNDS/2), R1
+
+loop:
+ ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
+ ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
+
+ ADD $-1, R1
+ BNE loop
+
+ // decrement length
+ ADD $-256, R4
+
+ // rearrange vectors
+ SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
+ ADDV(J0, X0, X1, X2, X3)
+ SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
+ ADDV(KEY0, X4, X5, X6, X7)
+ SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
+ ADDV(KEY1, X8, X9, X10, X11)
+ VAF CTR, X12, X12
+ SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
+ ADDV(NONCE, X12, X13, X14, X15)
+
+ // increment counters
+ VAF INC, CTR, CTR
+
+ // xor keystream with plaintext
+ XORV(0*64, R2, R3, X0, X4, X8, X12)
+ XORV(1*64, R2, R3, X1, X5, X9, X13)
+ XORV(2*64, R2, R3, X2, X6, X10, X14)
+ XORV(3*64, R2, R3, X3, X7, X11, X15)
+
+ // increment pointers
+ MOVD $256(R2), R2
+ MOVD $256(R3), R3
+
+ CMPBNE R4, $0, chacha
+
+ VSTEF $0, CTR, (R7)
+ RET
diff --git a/vendor/golang.org/x/crypto/chacha20/xor.go b/vendor/golang.org/x/crypto/chacha20/xor.go
new file mode 100644
index 0000000..c2d0485
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20/xor.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package chacha20
+
+import "runtime"
+
+// Platforms that have fast unaligned 32-bit little endian accesses.
+const unaligned = runtime.GOARCH == "386" ||
+ runtime.GOARCH == "amd64" ||
+ runtime.GOARCH == "arm64" ||
+ runtime.GOARCH == "ppc64le" ||
+ runtime.GOARCH == "s390x"
+
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
+// places the result in little endian byte order in dst.
+func addXor(dst, src []byte, a, b uint32) {
+ _, _ = src[3], dst[3] // bounds check elimination hint
+ if unaligned {
+ // The compiler should optimize this code into
+ // 32-bit unaligned little endian loads and stores.
+ // TODO: delete once the compiler does a reliably
+ // good job with the generic code below.
+ // See issue #25111 for more details.
+ v := uint32(src[0]) + v |= uint32(src[1]) << 8 + v |= uint32(src[2]) << 16 + v |= uint32(src[3]) << 24 + v ^= a + b + dst[0] = byte(v) + dst[1] = byte(v >> 8) + dst[2] = byte(v >> 16) + dst[3] = byte(v >> 24) + } else { + a += b + dst[0] = src[0] ^ byte(a) + dst[1] = src[1] ^ byte(a>>8) + dst[2] = src[2] ^ byte(a>>16) + dst[3] = src[3] ^ byte(a>>24) + } +} diff --git a/vendor/golang.org/x/crypto/internal/alias/alias.go b/vendor/golang.org/x/crypto/internal/alias/alias.go new file mode 100644 index 0000000..551ff0c --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/alias/alias.go @@ -0,0 +1,31 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego + +// Package alias implements memory aliasing tests. +package alias + +import "unsafe" + +// AnyOverlap reports whether x and y share memory at any (not necessarily +// corresponding) index. The memory beyond the slice length is ignored. +func AnyOverlap(x, y []byte) bool { + return len(x) > 0 && len(y) > 0 && + uintptr(unsafe.Pointer(&x[0])) <= uintptr(unsafe.Pointer(&y[len(y)-1])) && + uintptr(unsafe.Pointer(&y[0])) <= uintptr(unsafe.Pointer(&x[len(x)-1])) +} + +// InexactOverlap reports whether x and y share memory at any non-corresponding +// index. The memory beyond the slice length is ignored. Note that x and y can +// have different lengths and still not have any inexact overlap. +// +// InexactOverlap can be used to implement the requirements of the crypto/cipher +// AEAD, Block, BlockMode and Stream interfaces. +func InexactOverlap(x, y []byte) bool { + if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] { + return false + } + return AnyOverlap(x, y) +} diff --git a/vendor/golang.org/x/crypto/internal/alias/alias_purego.go b/vendor/golang.org/x/crypto/internal/alias/alias_purego.go new file mode 100644 index 0000000..6fe61b5 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/alias/alias_purego.go @@ -0,0 +1,34 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build purego + +// Package alias implements memory aliasing tests. +package alias + +// This is the Google App Engine standard variant based on reflect +// because the unsafe package and cgo are disallowed. + +import "reflect" + +// AnyOverlap reports whether x and y share memory at any (not necessarily +// corresponding) index. The memory beyond the slice length is ignored. +func AnyOverlap(x, y []byte) bool { + return len(x) > 0 && len(y) > 0 && + reflect.ValueOf(&x[0]).Pointer() <= reflect.ValueOf(&y[len(y)-1]).Pointer() && + reflect.ValueOf(&y[0]).Pointer() <= reflect.ValueOf(&x[len(x)-1]).Pointer() +} + +// InexactOverlap reports whether x and y share memory at any non-corresponding +// index. The memory beyond the slice length is ignored. Note that x and y can +// have different lengths and still not have any inexact overlap. +// +// InexactOverlap can be used to implement the requirements of the crypto/cipher +// AEAD, Block, BlockMode and Stream interfaces. 
+func InexactOverlap(x, y []byte) bool { + if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] { + return false + } + return AnyOverlap(x, y) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 4397591..5d22c62 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,10 +1,6 @@ # github.com/aead/argon2 v0.0.0-20180111183520-a87724528b07 ## explicit github.com/aead/argon2 -# github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da -## explicit -github.com/aead/chacha20 -github.com/aead/chacha20/chacha # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew @@ -25,6 +21,8 @@ github.com/stretchr/testify/assert # golang.org/x/crypto v0.17.0 ## explicit; go 1.18 golang.org/x/crypto/blake2b +golang.org/x/crypto/chacha20 +golang.org/x/crypto/internal/alias # golang.org/x/exp v0.0.0-20230105202349-8879d0199aa3 ## explicit; go 1.18 golang.org/x/exp/constraints
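
For reference, below is a minimal standalone sketch, not part of this diff or of the vendored golang.org/x/crypto/chacha20 package, illustrating the operation described in the addXor comment in xor.go: read a little-endian uint32 from src, XOR it with a+b, and write the result back to dst in little-endian byte order. The helper name addXorRef and the sample values are illustrative only.

package main

import (
	"encoding/binary"
	"fmt"
)

// addXorRef mirrors addXor's generic path: it reads src[0:4] as a
// little-endian uint32, XORs it with a+b, and stores the result into
// dst[0:4] in little-endian order.
func addXorRef(dst, src []byte, a, b uint32) {
	v := binary.LittleEndian.Uint32(src)
	binary.LittleEndian.PutUint32(dst, v^(a+b))
}

func main() {
	src := []byte{0x01, 0x02, 0x03, 0x04} // little-endian value 0x04030201
	dst := make([]byte, 4)
	addXorRef(dst, src, 0x11111111, 0x22222222) // a+b = 0x33333333
	fmt.Printf("%x\n", dst)                     // 0x04030201 ^ 0x33333333 = 0x37303132, printed as 32313037
}

Both branches of addXor produce this same result; the unaligned constant only selects whether the compiler is expected to fuse the per-byte loads and stores into single 32-bit accesses on the listed architectures.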