forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
crypto: blake2s - x86_64 SIMD implementation
These implementations from Samuel Neves support AVX and AVX-512VL. Originally this used AVX-512F, but Skylake thermal throttling made AVX-512VL more attractive and possible to do with negligible difference. Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> Signed-off-by: Samuel Neves <sneves@dei.uc.pt> Co-developed-by: Samuel Neves <sneves@dei.uc.pt> [ardb: move to arch/x86/crypto, wire into lib/crypto framework] Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
- Loading branch information
Showing 4 changed files with 499 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,258 @@ | ||
/* SPDX-License-Identifier: GPL-2.0 OR MIT */ | ||
/* | ||
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. | ||
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. | ||
*/ | ||
|
||
#include <linux/linkage.h> | ||
|
||
/*
 * Read-only constant tables for the BLAKE2s SIMD implementations below
 * (AT&T syntax).  Sections follow the mergeable-constant naming
 * convention (.rodata.cstN, "aM") so the linker may deduplicate them.
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
/* BLAKE2s initialization vector: 8 x 32-bit words as two 128-bit .octa. */
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
/*
 * pshufb mask rotating each 32-bit lane right by 16 bits
 * (byte select per dword: 2,3,0,1).
 */
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
/*
 * pshufb mask rotating each 32-bit lane right by 8 bits
 * (byte select per dword: 1,2,3,0).
 */
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
/*
 * Message-word schedule: 10 rounds x 16 byte indices into the 16-word
 * message block, consumed one row per round by the SSSE3 gather loop.
 * NOTE(review): the rows are not the textbook BLAKE2s sigma table; they
 * appear pre-reordered for the SIMD load order used below -- confirm
 * against the reference permutation before editing any entry.
 */
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
/*
 * AVX-512 variant of the schedule: 10 rounds x 16 dword indices,
 * consumed by vpermi2d to permute the message words held in ymm6/ymm7.
 * Same review note as SIGMA applies: rows are pre-reordered.
 */
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
|
||
.text
#ifdef CONFIG_AS_SSSE3
/*
 * void blake2s_compress_ssse3(state, block, nblocks, inc)
 *
 * SysV AMD64 ABI; leaf function, no stack usage.
 *   rdi = state   48 bytes read and written back: hash words at +0x00 and
 *                 +0x10, counter/flag words at +0x20.  NOTE(review):
 *                 presumably struct blake2s_state { u32 h[8]; u32 t[2];
 *                 u32 f[2]; } -- confirm against the C header.
 *   rsi = block   nblocks * 64 bytes of message; advanced 0x40/iteration.
 *   rdx = nblocks may be zero (checked on entry).
 *   rcx = inc     added to the low quadword at state+0x20 once per block
 *                 (64-bit paddq), i.e. the message-counter increment.
 * Clobbers: rax, rcx, r8, xmm0-xmm15, flags.
 *
 * Register roles in the loop:
 *   xmm0/xmm1 = rows a/b (hash words), xmm2 = row c (IV[0..3]),
 *   xmm3 = row d ((t,f) ^ IV[4..7]), xmm4-xmm8 = scratch,
 *   xmm10/xmm11 = saved a/b for the final feed-forward,
 *   xmm12 = ROT16 mask, xmm13 = ROR328 mask,
 *   xmm14 = counter/flag words, xmm15 = inc, r8 = end of SIGMA.
 */
ENTRY(blake2s_compress_ssse3)
	testq %rdx,%rdx		# nblocks == 0: nothing to do
	je .Lendofloop
	movdqu (%rdi),%xmm0		# a = h[0..3]
	movdqu 0x10(%rdi),%xmm1		# b = h[4..7]
	movdqa ROT16(%rip),%xmm12	# keep both rotate masks resident
	movdqa ROR328(%rip),%xmm13
	movdqu 0x20(%rdi),%xmm14	# counter/flag words
	movq %rcx,%xmm15		# inc as a 64-bit lane
	leaq SIGMA+0xa0(%rip),%r8	# sentinel: 10 rounds * 16 bytes
	jmp .Lbeginofloop
	.align 32
.Lbeginofloop:
	movdqa %xmm0,%xmm10		# snapshot a/b for feed-forward
	movdqa %xmm1,%xmm11
	paddq %xmm15,%xmm14		# t += inc (64-bit add, low qword)
	movdqa IV(%rip),%xmm2		# c = IV[0..3]
	movdqa %xmm14,%xmm3
	pxor IV+0x10(%rip),%xmm3	# d = (t,f) ^ IV[4..7]
	leaq SIGMA(%rip),%rcx		# rcx walks one schedule row per round
.Lroundloop:
	# Column step, first half: gather m[sigma[0..3]] into xmm4.
	movzbl (%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0x1(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x2(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x3(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	punpckldq %xmm5,%xmm4
	punpckldq %xmm7,%xmm6
	punpcklqdq %xmm6,%xmm4
	# G first half: a += m + b; d = (d ^ a) >>> 16; c += d; b = (b ^ c) >>> 12
	paddd %xmm4,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3		# >>> 16 via byte shuffle
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1			# b >>> 12 (shift/shift/or)
	# Column step, second half: gather m[sigma[4..7]] into xmm5.
	movzbl 0x4(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x5(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x6(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0x7(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	punpckldq %xmm6,%xmm5
	punpckldq %xmm4,%xmm7
	punpcklqdq %xmm7,%xmm5
	# G second half: rotations are >>> 8 (pshufb) and >>> 7 (shift pair).
	paddd %xmm5,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3		# >>> 8 via byte shuffle
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1			# b >>> 7
	# Diagonalize: rotate rows so the next G pass works on diagonals.
	pshufd $0x93,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	# Diagonal step, first half: gather m[sigma[8..11]] into xmm6.
	movzbl 0x8(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x9(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xa(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xb(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	punpckldq %xmm7,%xmm6
	punpckldq %xmm5,%xmm4
	punpcklqdq %xmm4,%xmm6
	paddd %xmm6,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3		# >>> 16
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1			# >>> 12
	# Diagonal step, second half: gather m[sigma[12..15]] into xmm7.
	movzbl 0xc(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xd(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xe(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0xf(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	punpckldq %xmm4,%xmm7
	punpckldq %xmm6,%xmm5
	punpcklqdq %xmm5,%xmm7
	paddd %xmm7,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3		# >>> 8
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1			# >>> 7
	# Undiagonalize (inverse row rotations of the shuffles above).
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x93,%xmm2,%xmm2
	addq $0x10,%rcx			# next SIGMA row
	cmpq %r8,%rcx
	jnz .Lroundloop			# 10 rounds total
	# Feed-forward: h' = h_old ^ (a ^ c), (b ^ d).
	pxor %xmm2,%xmm0
	pxor %xmm3,%xmm1
	pxor %xmm10,%xmm0
	pxor %xmm11,%xmm1
	addq $0x40,%rsi			# next 64-byte message block
	decq %rdx
	jnz .Lbeginofloop
	# Write back hash words and updated counter/flags.
	movdqu %xmm0,(%rdi)
	movdqu %xmm1,0x10(%rdi)
	movdqu %xmm14,0x20(%rdi)
.Lendofloop:
	ret
ENDPROC(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */
|
||
#ifdef CONFIG_AS_AVX512
/*
 * void blake2s_compress_avx512(state, block, nblocks, inc)
 *
 * Same arguments and state layout as blake2s_compress_ssse3 above.
 * Requires AVX-512VL: vprord with xmm operands replaces the SSSE3
 * shuffle/shift rotations, and vpermi2d gathers message words.
 * NOTE(review): unlike the SSSE3 entry there is no nblocks == 0 check;
 * the caller must guarantee at least one block -- confirm at call sites.
 *
 * Register roles:
 *   xmm0/xmm1 = rows a/b, xmm2 = row c, xmm3 = row d,
 *   xmm4 = counter/flag words, xmm5 = inc,
 *   ymm6/ymm7 = the 16 message words of the current block,
 *   ymm8/ymm9 = round-permuted message, xmm10/xmm11 = saved a/b,
 *   xmm14/xmm15 = IV halves, rax = SIGMA2 row pointer, cl = rounds left.
 * Clobbers: rax, rcx, ymm registers as above, flags.
 */
ENTRY(blake2s_compress_avx512)
	vmovdqu (%rdi),%xmm0		# a = h[0..3]
	vmovdqu 0x10(%rdi),%xmm1	# b = h[4..7]
	vmovdqu 0x20(%rdi),%xmm4	# counter/flag words
	vmovq %rcx,%xmm5		# inc as a 64-bit lane
	vmovdqa IV(%rip),%xmm14
	vmovdqa IV+16(%rip),%xmm15
	jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa %xmm0,%xmm10		# snapshot a/b for feed-forward
	vmovdqa %xmm1,%xmm11
	vpaddq %xmm5,%xmm4,%xmm4	# t += inc (64-bit add, low qword)
	vmovdqa %xmm14,%xmm2		# c = IV[0..3]
	vpxor %xmm15,%xmm4,%xmm3	# d = (t,f) ^ IV[4..7]
	vmovdqu (%rsi),%ymm6		# load the whole 64-byte block once
	vmovdqu 0x20(%rsi),%ymm7
	addq $0x40,%rsi			# advance to the next block up front
	leaq SIGMA2(%rip),%rax
	movb $0xa,%cl			# 10 rounds
.Lblake2s_compress_avx512_roundloop:
	addq $0x40,%rax
	# Permute the message for this round; results also become the
	# (re-permuted) message inputs of the next round.
	vmovdqa -0x40(%rax),%ymm8
	vmovdqa -0x20(%rax),%ymm9
	vpermi2d %ymm7,%ymm6,%ymm8
	vpermi2d %ymm7,%ymm6,%ymm9
	vmovdqa %ymm8,%ymm6
	vmovdqa %ymm9,%ymm7
	# Column step: G with rotations 16, 12 then 8, 7 via vprord.
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm8,%xmm8	# high half = next 4 message words
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	# Diagonalize rows.
	vpshufd $0x93,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x39,%xmm2,%xmm2
	# Diagonal step.
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm9,%xmm9
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	# Undiagonalize.
	vpshufd $0x39,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x93,%xmm2,%xmm2
	decb %cl
	jne .Lblake2s_compress_avx512_roundloop
	# Feed-forward: h' = h_old ^ (a ^ c), (b ^ d) (xor order immaterial).
	vpxor %xmm10,%xmm0,%xmm0
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm2,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	decq %rdx
	jne .Lblake2s_compress_avx512_mainloop
	vmovdqu %xmm0,(%rdi)		# write back hash words
	vmovdqu %xmm1,0x10(%rdi)
	vmovdqu %xmm4,0x20(%rdi)	# write back updated counter/flags
	vzeroupper			# ABI hygiene before returning to SSE/C code
	retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
Oops, something went wrong.