Commit 71e52c2

Ard Biesheuvel authored and herbertx committed
crypto: arm64/aes-ce-gcm - operate on two input blocks at a time
Update the core AES/GCM transform and the associated plumbing to operate
on 2 AES/GHASH blocks at a time. By itself, this is not expected to
result in a noticeable speedup, but it paves the way for reimplementing
the GHASH component using 2-way aggregation.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 parent 3465893 commit 71e52c2
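The change is easiest to read as a loop restructuring: each pass now consumes two input blocks, builds two counter blocks (n and n + 1), advances the counter by 2, and produces two keystream blocks whose AES rounds the assembly interleaves with the GHASH arithmetic. Below is a minimal C sketch of that structure for the encrypt direction. It is illustrative only: gcm_enc_2blocks(), aes_encrypt_block(), and ghash_update_block() are assumed helper names rather than the kernel's API, the counter layout is a simplification, and the sketch keeps AES and GHASH sequential and omits the keystream pipelining visible in the diff (the asm XORs in keystream saved from the previous pass and stores the freshly computed keystream for the next one).

/*
 * Illustrative sketch only -- not the kernel code. The helpers
 * aes_encrypt_block() and ghash_update_block(), and the counter layout
 * (fixed upper 8 bytes, 64-bit big-endian lower counter), are
 * assumptions made for readability.
 */
#include <stdint.h>
#include <string.h>

void aes_encrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]);
void ghash_update_block(uint8_t dg[16], const uint8_t h[16], const uint8_t blk[16]);

static void xor_block(uint8_t *out, const uint8_t *a, const uint8_t *b)
{
	for (int i = 0; i < 16; i++)
		out[i] = a[i] ^ b[i];
}

static void put_be64(uint8_t *p, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		p[i] = (uint8_t)(v >> (56 - 8 * i));
}

/* Encrypt and authenticate 'blocks' blocks (a multiple of 2), two per pass. */
static void gcm_enc_2blocks(const void *key, const uint8_t h[16], uint8_t dg[16],
			    const uint8_t ctr_hi[8], uint64_t *ctr_lo,
			    const uint8_t *src, uint8_t *dst, unsigned int blocks)
{
	while (blocks >= 2) {
		uint8_t ctr0[16], ctr1[16], ks0[16], ks1[16];

		/* Two counter blocks per iteration: n and n + 1 ... */
		memcpy(ctr0, ctr_hi, 8);
		memcpy(ctr1, ctr_hi, 8);
		put_be64(ctr0 + 8, *ctr_lo);
		put_be64(ctr1 + 8, *ctr_lo + 1);
		*ctr_lo += 2;			/* ... then advance by 2. */

		/*
		 * Two keystream blocks; the assembly interleaves these AES
		 * rounds with the pmull-based GHASH arithmetic.
		 */
		aes_encrypt_block(key, ctr0, ks0);
		aes_encrypt_block(key, ctr1, ks1);

		/* Encrypt two blocks and fold the ciphertext into GHASH. */
		xor_block(dst, src, ks0);
		xor_block(dst + 16, src + 16, ks1);
		ghash_update_block(dg, h, dst);
		ghash_update_block(dg, h, dst + 16);

		src += 32;
		dst += 32;
		blocks -= 2;
	}
}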

File tree

2 files changed: +161 −69 lines changed

arch/arm64/crypto/ghash-ce-core.S

Lines changed: 97 additions & 30 deletions
@@ -286,9 +286,10 @@ ENTRY(pmull_ghash_update_p8)
 	__pmull_ghash	p8
 ENDPROC(pmull_ghash_update_p8)

-	KS		.req	v8
-	CTR		.req	v9
-	INP		.req	v10
+	KS0		.req	v8
+	KS1		.req	v9
+	INP0		.req	v10
+	INP1		.req	v11

 	.macro		load_round_keys, rounds, rk
 	cmp		\rounds, #12
@@ -336,101 +337,167 @@ CPU_LE( rev x8, x8 )

 	.if		\enc == 1
 	ldr		x10, [sp]
-	ld1		{KS.16b}, [x10]
+	ld1		{KS0.16b-KS1.16b}, [x10]
 	.endif

-0:	ld1		{CTR.8b}, [x5]			// load upper counter
-	ld1		{INP.16b}, [x3], #16
+0:	ld1		{INP0.16b-INP1.16b}, [x3], #32
+
 	rev		x9, x8
-	add		x8, x8, #1
-	sub		w0, w0, #1
-	ins		CTR.d[1], x9			// set lower counter
+	add		x11, x8, #1
+	add		x8, x8, #2

 	.if		\enc == 1
-	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x2], #16
+	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
+	eor		INP1.16b, INP1.16b, KS1.16b
 	.endif

-	rev64		T1.16b, INP.16b
+	ld1		{KS0.8b}, [x5]			// load upper counter
+	rev		x11, x11
+	sub		w0, w0, #2
+	mov		KS1.8b, KS0.8b
+	ins		KS0.d[1], x9			// set lower counter
+	ins		KS1.d[1], x11
+
+	rev64		T1.16b, INP0.16b

 	cmp		w7, #12
 	b.ge		2f				// AES-192/256?

-1:	enc_round	CTR, v21
+1:	enc_round	KS0, v21

 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8

-	enc_round	CTR, v22
+	enc_round	KS1, v21

 	eor		T1.16b, T1.16b, T2.16b
 	eor		XL.16b, XL.16b, IN1.16b

-	enc_round	CTR, v23
+	enc_round	KS0, v22

 	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
 	eor		T1.16b, T1.16b, XL.16b

-	enc_round	CTR, v24
+	enc_round	KS1, v22

 	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
 	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

-	enc_round	CTR, v25
+	enc_round	KS0, v23

 	ext		T1.16b, XL.16b, XH.16b, #8
 	eor		T2.16b, XL.16b, XH.16b
 	eor		XM.16b, XM.16b, T1.16b

-	enc_round	CTR, v26
+	enc_round	KS1, v23

 	eor		XM.16b, XM.16b, T2.16b
 	pmull		T2.1q, XL.1d, MASK.1d

-	enc_round	CTR, v27
+	enc_round	KS0, v24

 	mov		XH.d[0], XM.d[1]
 	mov		XM.d[1], XL.d[0]

-	enc_round	CTR, v28
+	enc_round	KS1, v24

 	eor		XL.16b, XM.16b, T2.16b

-	enc_round	CTR, v29
+	enc_round	KS0, v25

 	ext		T2.16b, XL.16b, XL.16b, #8

-	aese		CTR.16b, v30.16b
+	enc_round	KS1, v25

 	pmull		XL.1q, XL.1d, MASK.1d
 	eor		T2.16b, T2.16b, XH.16b

-	eor		KS.16b, CTR.16b, v31.16b
+	enc_round	KS0, v26
+
+	eor		XL.16b, XL.16b, T2.16b
+	rev64		T1.16b, INP1.16b
+
+	enc_round	KS1, v26
+
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+
+	enc_round	KS0, v27
+
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
+
+	enc_round	KS1, v27
+
+	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+
+	enc_round	KS0, v28
+
+	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
+	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
+
+	enc_round	KS1, v28
+
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+
+	enc_round	KS0, v29
+
+	eor		XM.16b, XM.16b, T2.16b
+	pmull		T2.1q, XL.1d, MASK.1d
+
+	enc_round	KS1, v29
+
+	mov		XH.d[0], XM.d[1]
+	mov		XM.d[1], XL.d[0]
+
+	aese		KS0.16b, v30.16b
+
+	eor		XL.16b, XM.16b, T2.16b
+
+	aese		KS1.16b, v30.16b
+
+	ext		T2.16b, XL.16b, XL.16b, #8
+
+	eor		KS0.16b, KS0.16b, v31.16b
+
+	pmull		XL.1q, XL.1d, MASK.1d
+	eor		T2.16b, T2.16b, XH.16b
+
+	eor		KS1.16b, KS1.16b, v31.16b

 	eor		XL.16b, XL.16b, T2.16b

 	.if		\enc == 0
-	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x2], #16
+	eor		INP0.16b, INP0.16b, KS0.16b
+	eor		INP1.16b, INP1.16b, KS1.16b
 	.endif

+	st1		{INP0.16b-INP1.16b}, [x2], #32
+
 	cbnz		w0, 0b

 CPU_LE(	rev		x8, x8		)
 	st1		{XL.2d}, [x1]
 	str		x8, [x5, #8]			// store lower counter

 	.if		\enc == 1
-	st1		{KS.16b}, [x10]
+	st1		{KS0.16b-KS1.16b}, [x10]
 	.endif

 	ret

 2:	b.eq		3f				// AES-192?
-	enc_round	CTR, v17
-	enc_round	CTR, v18
-3:	enc_round	CTR, v19
-	enc_round	CTR, v20
+	enc_round	KS0, v17
+	enc_round	KS1, v17
+	enc_round	KS0, v18
+	enc_round	KS1, v18
+3:	enc_round	KS0, v19
+	enc_round	KS1, v19
+	enc_round	KS0, v20
+	enc_round	KS1, v20
 	b		1b
 	.endm

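For reference, the "a1 * b1", "a0 * b0", and "(a1 + a0)(b1 + b0)" annotations in the hunk above follow the standard Karatsuba-style split of the 128-bit carry-less GHASH multiply into 64-bit halves; a sketch of the identity behind those comments, where addition is XOR in GF(2):

With $a = a_1 x^{64} + a_0$ and $b = b_1 x^{64} + b_0$:
\[
a \cdot b = a_1 b_1\, x^{128} + \bigl[(a_1 + a_0)(b_1 + b_0) + a_1 b_1 + a_0 b_0\bigr]\, x^{64} + a_0 b_0
\]

so one block costs three 64x64-bit carry-less multiplies (pmull2, pmull, pmull) before the reduction modulo the GHASH polynomial.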