@@ -286,9 +286,10 @@ ENTRY(pmull_ghash_update_p8)
286286 __pmull_ghash p8
287287ENDPROC(pmull_ghash_update_p8)
288288
289- KS .req v8
290- CTR .req v9
291- INP .req v10
289+ KS0 .req v8 // counter block / AES state for the 1st block of each pair; XORed into INP0
290+ KS1 .req v9 // counter block / AES state for the 2nd block of each pair; XORed into INP1
291+ INP0 .req v10 // 1st 16-byte block of the 32 bytes loaded from [x3] per iteration
292+ INP1 .req v11 // 2nd 16-byte block of the 32 bytes loaded from [x3] per iteration
292293
293294 .macro load_round_keys , rounds , rk
294295 cmp \rounds , # 12
@@ -336,101 +337,167 @@ CPU_LE( rev x8, x8 )
336337
337338 .if \enc == 1
338339 ldr x10 , [ sp ]
339- ld1 {KS .16b} , [ x10 ]
340+ ld1 {KS0.16b - KS1 .16b}, [ x10 ]
340341 .endif
341342
342- 0 : ld1 {CTR.8b } , [ x5 ] // load upper counter
343- ld1 {INP.16b} , [ x3 ], # 16
343+ 0 : ld1 {INP0.16b - INP1.16b }, [ x3 ], # 32
344+
344345 rev x9 , x8
345- add x8 , x8 , # 1
346- sub w0 , w0 , # 1
347- ins CTR.d [ 1 ], x9 // set lower counter
346+ add x11 , x8 , # 1
347+ add x8 , x8 , # 2
348348
349349 .if \enc == 1
350- eor INP .16b , INP .16b , KS .16b // encrypt input
351- st1 {INP .16b} , [ x2 ], # 16
350+ eor INP0 .16b , INP0 .16b , KS0 .16b // encrypt input
351+ eor INP1 .16b , INP1.16b , KS1.16b
352352 .endif
353353
354- rev64 T1.16b , INP.16b
354+ ld1 {KS0.8b} , [ x5 ] // load upper counter
355+ rev x11 , x11
356+ sub w0 , w0 , # 2
357+ mov KS1.8b , KS0.8b
358+ ins KS0.d [ 1 ], x9 // set lower counter
359+ ins KS1.d [ 1 ], x11
360+
361+ rev64 T1.16b , INP0.16b
355362
356363 cmp w7 , # 12
357364 b.ge 2f // AES - 192 / 256 ?
358365
359- 1 : enc_round CTR , v21
366+ 1 : enc_round KS0 , v21
360367
361368 ext T2.16b , XL.16b , XL.16b , # 8
362369 ext IN1.16b , T1.16b , T1.16b , # 8
363370
364- enc_round CTR , v22
371+ enc_round KS1 , v21
365372
366373 eor T1.16b , T1.16b , T2.16b
367374 eor XL.16b , XL.16b , IN1.16b
368375
369- enc_round CTR , v23
376+ enc_round KS0 , v22
370377
371378 pmull2 XH.1q , SHASH.2d , XL.2d // a1 * b1
372379 eor T1.16b , T1.16b , XL.16b
373380
374- enc_round CTR , v24
381+ enc_round KS1 , v22
375382
376383 pmull XL.1q , SHASH.1d , XL.1d // a0 * b0
377384 pmull XM.1q , SHASH2.1d , T1.1d // (a1 + a0)(b1 + b0)
378385
379- enc_round CTR , v25
386+ enc_round KS0 , v23
380387
381388 ext T1.16b , XL.16b , XH.16b , # 8
382389 eor T2.16b , XL.16b , XH.16b
383390 eor XM.16b , XM.16b , T1.16b
384391
385- enc_round CTR , v26
392+ enc_round KS1 , v23
386393
387394 eor XM.16b , XM.16b , T2.16b
388395 pmull T2.1q , XL.1d , MASK.1d
389396
390- enc_round CTR , v27
397+ enc_round KS0 , v24
391398
392399 mov XH.d [ 0 ], XM.d [ 1 ]
393400 mov XM.d [ 1 ], XL.d [ 0 ]
394401
395- enc_round CTR , v28
402+ enc_round KS1 , v24
396403
397404 eor XL.16b , XM.16b , T2.16b
398405
399- enc_round CTR , v29
406+ enc_round KS0 , v25
400407
401408 ext T2.16b , XL.16b , XL.16b , # 8
402409
403- aese CTR.16b , v30.16b
410+ enc_round KS1 , v25
404411
405412 pmull XL.1q , XL.1d , MASK.1d
406413 eor T2.16b , T2.16b , XH.16b
407414
408- eor KS.16b , CTR.16b , v31.16b
415+ enc_round KS0 , v26
416+
417+ eor XL.16b , XL.16b , T2.16b
418+ rev64 T1.16b , INP1.16b
419+
420+ enc_round KS1 , v26
421+
422+ ext T2.16b , XL.16b , XL.16b , # 8
423+ ext IN1.16b , T1.16b , T1.16b , # 8
424+
425+ enc_round KS0 , v27
426+
427+ eor T1.16b , T1.16b , T2.16b
428+ eor XL.16b , XL.16b , IN1.16b
429+
430+ enc_round KS1 , v27
431+
432+ pmull2 XH.1q , SHASH.2d , XL.2d // a1 * b1
433+ eor T1.16b , T1.16b , XL.16b
434+
435+ enc_round KS0 , v28
436+
437+ pmull XL.1q , SHASH.1d , XL.1d // a0 * b0
438+ pmull XM.1q , SHASH2.1d , T1.1d // (a1 + a0)(b1 + b0)
439+
440+ enc_round KS1 , v28
441+
442+ ext T1.16b , XL.16b , XH.16b , # 8
443+ eor T2.16b , XL.16b , XH.16b
444+ eor XM.16b , XM.16b , T1.16b
445+
446+ enc_round KS0 , v29
447+
448+ eor XM.16b , XM.16b , T2.16b
449+ pmull T2.1q , XL.1d , MASK.1d
450+
451+ enc_round KS1 , v29
452+
453+ mov XH.d [ 0 ], XM.d [ 1 ]
454+ mov XM.d [ 1 ], XL.d [ 0 ]
455+
456+ aese KS0.16b , v30.16b
457+
458+ eor XL.16b , XM.16b , T2.16b
459+
460+ aese KS1.16b , v30.16b
461+
462+ ext T2.16b , XL.16b , XL.16b , # 8
463+
464+ eor KS0.16b , KS0.16b , v31.16b
465+
466+ pmull XL.1q , XL.1d , MASK.1d
467+ eor T2.16b , T2.16b , XH.16b
468+
469+ eor KS1.16b , KS1.16b , v31.16b
409470
410471 eor XL.16b , XL.16b , T2.16b
411472
412473 .if \enc == 0
413- eor INP .16b , INP .16b , KS .16b
414- st1 {INP .16b} , [ x2 ], # 16
474+ eor INP0 .16b , INP0 .16b , KS0 .16b
475+ eor INP1 .16b , INP1.16b , KS1.16b
415476 .endif
416477
478+ st1 {INP0.16b - INP1.16b} , [ x2 ], # 32
479+
417480 cbnz w0 , 0b
418481
419482CPU_LE( rev x8 , x8 )
420483 st1 {XL.2d} , [ x1 ]
421484 str x8 , [ x5 , # 8 ] // store lower counter
422485
423486 .if \enc == 1
424- st1 {KS .16b} , [ x10 ]
487+ st1 {KS0.16b - KS1 .16b}, [ x10 ]
425488 .endif
426489
427490 ret
428491
4294922 : b.eq 3f // AES - 192 ?
430- enc_round CTR , v17
431- enc_round CTR , v18
432- 3 : enc_round CTR , v19
433- enc_round CTR , v20
493+ enc_round KS0 , v17
494+ enc_round KS1 , v17
495+ enc_round KS0 , v18
496+ enc_round KS1 , v18
497+ 3 : enc_round KS0 , v19
498+ enc_round KS1 , v19
499+ enc_round KS0 , v20
500+ enc_round KS1 , v20
434501 b 1b
435502 .endm
436503
0 commit comments