Skip to content

Commit

Permalink
Take vshufi64x2 out of main loop and initialize upfront (for parallel 81 only) (#139)
Browse files Browse the repository at this point in the history
  • Loading branch information
fwessels authored May 13, 2020
1 parent d5afb5f commit d6d9fba
Showing 1 changed file with 35 additions and 11 deletions.
46 changes: 35 additions & 11 deletions galoisAvx512_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@
VPANDQ Z2, Z0, Z0 \ // low input
VPANDQ Z2, Z1, Z1 // high input

// GALOIS_MUL performs two byte-table lookups and accumulates them into OUT.
//
//   MUL_LO, MUL_HI  registers used as the VPSHUFB source tables (the
//                   multiplication constants for the low and high part)
//   LO, HI          scratch registers; overwritten with the lookup results
//   OUT             accumulator; updated in place
//
// Z0 and Z1 supply the shuffle indices (the masked "low input" and
// "high input"), so they must be set up before this macro is used.
// VPTERNLOGD with truth table 0x96 is a three-input XOR, i.e.
// OUT = OUT ^ LO ^ HI.
#define GALOIS_MUL(MUL_LO, MUL_HI, LO, HI, OUT) \
VPSHUFB Z0, MUL_LO, LO \ // mul low part
VPSHUFB Z1, MUL_HI, HI \ // mul high part
VPTERNLOGD $0x96, LO, HI, OUT

// GALOIS rearranges the 128-bit lanes of IN according to the immediates C1
// and C2 (VSHUFI64X2 with IN as both sources), placing the results in LO and
// HI, then uses them as VPSHUFB tables indexed by Z0 and Z1 and XORs both
// lookup results into OUT (truth table 0x96 = three-input XOR).
// LO and HI are overwritten; OUT is updated in place.
//
// NOTE(review): the three instructions after the lane shuffles and the
// trailing GALOIS_MUL(LO, HI, LO, HI, OUT) line compute the same thing; they
// look like the removed and added sides of a diff shown together. Confirm
// which form the real file contains (the last instruction line carries no
// line continuation, so the GALOIS_MUL call is outside the macro as shown).
#define GALOIS(C1, C2, IN, LO, HI, OUT) \
VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \
VPSHUFB Z0, LO, LO \ // mul low part
VPSHUFB Z1, HI, HI \ // mul high part
VPTERNLOGD $0x96, LO, HI, OUT
GALOIS_MUL(LO, HI, LO, HI, OUT)

//
// Process single output row from a total of 8 input rows
Expand All @@ -34,6 +37,27 @@ TEXT ·_galMulAVX512Parallel81(SB), 7, $0
VMOVDQU64 0x080(SI), Z18
VMOVDQU64 0x0c0(SI), Z19

// Initialize multiplication constants
VSHUFI64X2 $0x55, Z16, Z16, Z20
VSHUFI64X2 $0xaa, Z16, Z16, Z24
VSHUFI64X2 $0xff, Z16, Z16, Z28
VSHUFI64X2 $0x00, Z16, Z16, Z16

VSHUFI64X2 $0x55, Z17, Z17, Z21
VSHUFI64X2 $0xaa, Z17, Z17, Z25
VSHUFI64X2 $0xff, Z17, Z17, Z29
VSHUFI64X2 $0x00, Z17, Z17, Z17

VSHUFI64X2 $0x55, Z18, Z18, Z22
VSHUFI64X2 $0xaa, Z18, Z18, Z26
VSHUFI64X2 $0xff, Z18, Z18, Z30
VSHUFI64X2 $0x00, Z18, Z18, Z18

VSHUFI64X2 $0x55, Z19, Z19, Z23
VSHUFI64X2 $0xaa, Z19, Z19, Z27
VSHUFI64X2 $0xff, Z19, Z19, Z31
VSHUFI64X2 $0x00, Z19, Z19, Z19

MOVQ $15, BX
VPBROADCASTB BX, Z2

Expand All @@ -50,49 +74,49 @@ loopback_avx512_parallel81:
VMOVDQU64.Z (DX), K1, Z4

LOAD(0x00) // &in[0][0]
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
GALOIS_MUL(Z16, Z20, Z14, Z15, Z4)

CMPQ AX, $1
JE skip_avx512_parallel81

LOAD(0x18) // &in[1][0]
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
GALOIS_MUL(Z24, Z28, Z14, Z15, Z4)

CMPQ AX, $2
JE skip_avx512_parallel81

LOAD(0x30) // &in[2][0]
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
GALOIS_MUL(Z17, Z21, Z14, Z15, Z4)

CMPQ AX, $3
JE skip_avx512_parallel81

LOAD(0x48) // &in[3][0]
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
GALOIS_MUL(Z25, Z29, Z14, Z15, Z4)

CMPQ AX, $4
JE skip_avx512_parallel81

LOAD(0x60) // &in[4][0]
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
GALOIS_MUL(Z18, Z22, Z14, Z15, Z4)

CMPQ AX, $5
JE skip_avx512_parallel81

LOAD(0x78) // &in[5][0]
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
GALOIS_MUL(Z26, Z30, Z14, Z15, Z4)

CMPQ AX, $6
JE skip_avx512_parallel81

LOAD(0x90) // &in[6][0]
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
GALOIS_MUL(Z19, Z23, Z14, Z15, Z4)

CMPQ AX, $7
JE skip_avx512_parallel81

LOAD(0xa8) // &in[7][0]
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
GALOIS_MUL(Z27, Z31, Z14, Z15, Z4)

skip_avx512_parallel81:
VMOVDQU64 Z4, (DX)
Expand Down

0 comments on commit d6d9fba

Please sign in to comment.