Skip to content

Commit

Permalink
crypto: arm64/aes-ce-cipher - move assembler code to .S file
Browse files Browse the repository at this point in the history
Most crypto drivers involving kernel mode NEON take care to put the code
that actually touches the NEON register file in a separate compilation
unit, to prevent the compiler from reordering code that preserves or
restores the NEON context with code that may corrupt it. This is
necessary because we currently have no way to express the restrictions
imposed upon use of the NEON in kernel mode in a way that the compiler
understands.

However, in the case of aes-ce-cipher, it did not seem unreasonable to
deviate from this rule, given how it does not seem possible for the
compiler to reorder cross object function calls with asm blocks whose
in- and output constraints reflect that it reads from and writes to
memory.

Now that LTO is being proposed for the arm64 kernel, it is time to
revisit this. The link time optimization may replace the function
calls to kernel_neon_begin() and kernel_neon_end() with instantiations
of the IR that make up its implementation, allowing further reordering
with the asm block.

So let's clean this up, and move the asm() blocks into a separate .S
file.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-By: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
  • Loading branch information
Ard Biesheuvel authored and herbertx committed Nov 29, 2017
1 parent 1964e33 commit 019cd46
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 104 deletions.
2 changes: 1 addition & 1 deletion arch/arm64/crypto/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o

obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o

obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
Expand Down
87 changes: 87 additions & 0 deletions arch/arm64/crypto/aes-ce-core.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/

#include <linux/linkage.h>
#include <asm/assembler.h>

.arch armv8-a+crypto

ENTRY(__aes_ce_encrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
cmp w3, #10
bmi 0f
bne 3f
mov v3.16b, v1.16b
b 2f
0: mov v2.16b, v1.16b
ld1 {v3.4s}, [x0], #16
1: aese v0.16b, v2.16b
aesmc v0.16b, v0.16b
2: ld1 {v1.4s}, [x0], #16
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
3: ld1 {v2.4s}, [x0], #16
subs w3, w3, #3
aese v0.16b, v1.16b
aesmc v0.16b, v0.16b
ld1 {v3.4s}, [x0], #16
bpl 1b
aese v0.16b, v2.16b
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
ENDPROC(__aes_ce_encrypt)

ENTRY(__aes_ce_decrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
cmp w3, #10
bmi 0f
bne 3f
mov v3.16b, v1.16b
b 2f
0: mov v2.16b, v1.16b
ld1 {v3.4s}, [x0], #16
1: aesd v0.16b, v2.16b
aesimc v0.16b, v0.16b
2: ld1 {v1.4s}, [x0], #16
aesd v0.16b, v3.16b
aesimc v0.16b, v0.16b
3: ld1 {v2.4s}, [x0], #16
subs w3, w3, #3
aesd v0.16b, v1.16b
aesimc v0.16b, v0.16b
ld1 {v3.4s}, [x0], #16
bpl 1b
aesd v0.16b, v2.16b
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
ENDPROC(__aes_ce_decrypt)

/*
* __aes_ce_sub() - use the aese instruction to perform the AES sbox
* substitution on each byte in 'input'
*/
ENTRY(__aes_ce_sub)
dup v1.4s, w0
movi v0.16b, #0
aese v0.16b, v1.16b
umov w0, v0.s[0]
ret
ENDPROC(__aes_ce_sub)

ENTRY(__aes_ce_invert)
ld1 {v0.4s}, [x1]
aesimc v1.16b, v0.16b
st1 {v1.4s}, [x0]
ret
ENDPROC(__aes_ce_invert)
115 changes: 12 additions & 103 deletions arch/arm64/crypto/aes-ce-cipher.c → arch/arm64/crypto/aes-ce-glue.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ struct aes_block {
u8 b[AES_BLOCK_SIZE];
};

asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);

asmlinkage u32 __aes_ce_sub(u32 l);
asmlinkage void __aes_ce_invert(struct aes_block *out,
const struct aes_block *in);

static int num_rounds(struct crypto_aes_ctx *ctx)
{
/*
Expand All @@ -44,123 +51,31 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
struct aes_block *out = (struct aes_block *)dst;
struct aes_block const *in = (struct aes_block *)src;
void *dummy0;
int dummy1;

if (!may_use_simd()) {
__aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
return;
}

kernel_neon_begin();

__asm__(" ld1 {v0.16b}, %[in] ;"
" ld1 {v1.4s}, [%[key]], #16 ;"
" cmp %w[rounds], #10 ;"
" bmi 0f ;"
" bne 3f ;"
" mov v3.16b, v1.16b ;"
" b 2f ;"
"0: mov v2.16b, v1.16b ;"
" ld1 {v3.4s}, [%[key]], #16 ;"
"1: aese v0.16b, v2.16b ;"
" aesmc v0.16b, v0.16b ;"
"2: ld1 {v1.4s}, [%[key]], #16 ;"
" aese v0.16b, v3.16b ;"
" aesmc v0.16b, v0.16b ;"
"3: ld1 {v2.4s}, [%[key]], #16 ;"
" subs %w[rounds], %w[rounds], #3 ;"
" aese v0.16b, v1.16b ;"
" aesmc v0.16b, v0.16b ;"
" ld1 {v3.4s}, [%[key]], #16 ;"
" bpl 1b ;"
" aese v0.16b, v2.16b ;"
" eor v0.16b, v0.16b, v3.16b ;"
" st1 {v0.16b}, %[out] ;"

: [out] "=Q"(*out),
[key] "=r"(dummy0),
[rounds] "=r"(dummy1)
: [in] "Q"(*in),
"1"(ctx->key_enc),
"2"(num_rounds(ctx) - 2)
: "cc");

__aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
kernel_neon_end();
}

static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
struct aes_block *out = (struct aes_block *)dst;
struct aes_block const *in = (struct aes_block *)src;
void *dummy0;
int dummy1;

if (!may_use_simd()) {
__aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
return;
}

kernel_neon_begin();

__asm__(" ld1 {v0.16b}, %[in] ;"
" ld1 {v1.4s}, [%[key]], #16 ;"
" cmp %w[rounds], #10 ;"
" bmi 0f ;"
" bne 3f ;"
" mov v3.16b, v1.16b ;"
" b 2f ;"
"0: mov v2.16b, v1.16b ;"
" ld1 {v3.4s}, [%[key]], #16 ;"
"1: aesd v0.16b, v2.16b ;"
" aesimc v0.16b, v0.16b ;"
"2: ld1 {v1.4s}, [%[key]], #16 ;"
" aesd v0.16b, v3.16b ;"
" aesimc v0.16b, v0.16b ;"
"3: ld1 {v2.4s}, [%[key]], #16 ;"
" subs %w[rounds], %w[rounds], #3 ;"
" aesd v0.16b, v1.16b ;"
" aesimc v0.16b, v0.16b ;"
" ld1 {v3.4s}, [%[key]], #16 ;"
" bpl 1b ;"
" aesd v0.16b, v2.16b ;"
" eor v0.16b, v0.16b, v3.16b ;"
" st1 {v0.16b}, %[out] ;"

: [out] "=Q"(*out),
[key] "=r"(dummy0),
[rounds] "=r"(dummy1)
: [in] "Q"(*in),
"1"(ctx->key_dec),
"2"(num_rounds(ctx) - 2)
: "cc");

__aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
kernel_neon_end();
}

/*
* aes_sub() - use the aese instruction to perform the AES sbox substitution
* on each byte in 'input'
*/
static u32 aes_sub(u32 input)
{
u32 ret;

__asm__("dup v1.4s, %w[in] ;"
"movi v0.16b, #0 ;"
"aese v0.16b, v1.16b ;"
"umov %w[out], v0.4s[0] ;"

: [out] "=r"(ret)
: [in] "r"(input)
: "v0","v1");

return ret;
}

int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
{
Expand Down Expand Up @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;

rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
rko[1] = rko[0] ^ rki[1];
rko[2] = rko[1] ^ rki[2];
rko[3] = rko[2] ^ rki[3];
Expand All @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
} else if (key_len == AES_KEYSIZE_256) {
if (i >= 6)
break;
rko[4] = aes_sub(rko[3]) ^ rki[4];
rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
rko[5] = rko[4] ^ rki[5];
rko[6] = rko[5] ^ rki[6];
rko[7] = rko[6] ^ rki[7];
Expand All @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,

key_dec[0] = key_enc[j];
for (i = 1, j--; j > 0; i++, j--)
__asm__("ld1 {v0.4s}, %[in] ;"
"aesimc v1.16b, v0.16b ;"
"st1 {v1.4s}, %[out] ;"

: [out] "=Q"(key_dec[i])
: [in] "Q"(key_enc[j])
: "v0","v1");
__aes_ce_invert(key_dec + i, key_enc + j);
key_dec[i] = key_enc[0];

kernel_neon_end();
Expand Down

0 comments on commit 019cd46

Please sign in to comment.