Skip to content

Commit

Permalink
Use AVX2 SIMD assembly instructions in favor of BYTE sequences. (#73)
Browse files Browse the repository at this point in the history
* Use AVX2 SIMD assembly instructions in favor of BYTE sequences.
  • Loading branch information
fwessels authored and klauspost committed Nov 18, 2017
1 parent 6bb6130 commit 3610933
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 39 deletions.
66 changes: 32 additions & 34 deletions galois_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -87,40 +87,39 @@ TEXT ·galMulAVX2Xor(SB), 7, $0
MOVQ high+24(FP), DX // DX: &high
MOVQ $15, BX // BX: low mask
MOVQ BX, X5
MOVOU (SI), X6 // X6 low
MOVOU (SI), X6 // X6: low
MOVOU (DX), X7 // X7: high
MOVQ in_len+56(FP), R9 // R9: len(in)

LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
VINSERTI128 $1, X6, Y6, Y6 // low
VINSERTI128 $1, X7, Y7, Y7 // high
VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)

SHRQ $5, R9 // len(in) /32
SHRQ $5, R9 // len(in) / 32
MOVQ out+72(FP), DX // DX: &out
MOVQ in+48(FP), SI // R11: &in
MOVQ in+48(FP), SI // SI: &in
TESTQ R9, R9
JZ done_xor_avx2

loopback_xor_avx2:
LONG $0x066ffec5 // VMOVDQU YMM0, [rsi]
LONG $0x226ffec5 // VMOVDQU YMM4, [rdx]
LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; X1: high input
LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; X0: low input
LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; X1: high input
LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part
LONG $0xdbefedc5 // VPXOR YMM3, YMM2, YMM3 ; X3: Result
LONG $0xe4efe5c5 // VPXOR YMM4, YMM3, YMM4 ; X4: Result
LONG $0x227ffec5 // VMOVDQU [rdx], YMM4
VMOVDQU (SI), Y0
VMOVDQU (DX), Y4
VPSRLQ $4, Y0, Y1 // Y1: high input
VPAND Y8, Y0, Y0 // Y0: low input
VPAND Y8, Y1, Y1 // Y1: high input
VPSHUFB Y0, Y6, Y2 // Y2: mul low part
VPSHUFB Y1, Y7, Y3 // Y3: mul high part
VPXOR Y3, Y2, Y3 // Y3: Result
VPXOR Y4, Y3, Y4 // Y4: Result
VMOVDQU Y4, (DX)

ADDQ $32, SI // in+=32
ADDQ $32, DX // out+=32
SUBQ $1, R9
JNZ loopback_xor_avx2

done_xor_avx2:
// VZEROUPPER
BYTE $0xc5; BYTE $0xf8; BYTE $0x77
VZEROUPPER
RET

// func galMulAVX2(low, high, in, out []byte)
Expand All @@ -129,38 +128,37 @@ TEXT ·galMulAVX2(SB), 7, $0
MOVQ high+24(FP), DX // DX: &high
MOVQ $15, BX // BX: low mask
MOVQ BX, X5
MOVOU (SI), X6 // X6 low
MOVOU (SI), X6 // X6: low
MOVOU (DX), X7 // X7: high
MOVQ in_len+56(FP), R9 // R9: len(in)

LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
VINSERTI128 $1, X6, Y6, Y6 // low
VINSERTI128 $1, X7, Y7, Y7 // high
VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)

SHRQ $5, R9 // len(in) /32
SHRQ $5, R9 // len(in) / 32
MOVQ out+72(FP), DX // DX: &out
MOVQ in+48(FP), SI // R11: &in
MOVQ in+48(FP), SI // SI: &in
TESTQ R9, R9
JZ done_avx2

loopback_avx2:
LONG $0x066ffec5 // VMOVDQU YMM0, [rsi]
LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; X1: high input
LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; X0: low input
LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; X1: high input
LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part
LONG $0xe3efedc5 // VPXOR YMM4, YMM2, YMM3 ; X4: Result
LONG $0x227ffec5 // VMOVDQU [rdx], YMM4
VMOVDQU (SI), Y0
VPSRLQ $4, Y0, Y1 // Y1: high input
VPAND Y8, Y0, Y0 // Y0: low input
VPAND Y8, Y1, Y1 // Y1: high input
VPSHUFB Y0, Y6, Y2 // Y2: mul low part
VPSHUFB Y1, Y7, Y3 // Y3: mul high part
VPXOR Y3, Y2, Y4 // Y4: Result
VMOVDQU Y4, (DX)

ADDQ $32, SI // in+=32
ADDQ $32, DX // out+=32
SUBQ $1, R9
JNZ loopback_avx2

done_avx2:

BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
VZEROUPPER
RET

// func sSE2XorSlice(in, out []byte)
Expand Down
19 changes: 14 additions & 5 deletions galois_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func TestExp(t *testing.T) {
}
}

func TestGalois(t *testing.T) {
func testGalois(t *testing.T, ssse3, avx2 bool) {
// These values were copied output of the Python code.
if galMultiply(3, 4) != 12 {
t.Fatal("galMultiply(3, 4) != 12")
Expand All @@ -131,25 +131,25 @@ func TestGalois(t *testing.T) {
// Test slices (>32 entries to test assembler -- AVX2 & NEON)
in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185}
out := make([]byte, len(in))
galMulSlice(25, in, out, false, false)
galMulSlice(25, in, out, ssse3, avx2)
expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, 0x23, 0x3a, 0x75, 0x6c, 0x47}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
}
expectXor := []byte{0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, 0xcc, 0xe1, 0x22, 0xf, 0x78}
galMulSliceXor(52, in, out, false, false)
galMulSliceXor(52, in, out, ssse3, avx2)
if 0 != bytes.Compare(out, expectXor) {
t.Errorf("got %#v, expected %#v", out, expectXor)
}

galMulSlice(177, in, out, false, false)
galMulSlice(177, in, out, ssse3, avx2)
expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, 0x64, 0xd5, 0xe5, 0x54, 0x9a}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
}

expectXor = []byte{0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, 0x4a, 0x8e, 0xe8, 0x2c, 0x7d}
galMulSliceXor(117, in, out, false, false)
galMulSliceXor(117, in, out, ssse3, avx2)
if 0 != bytes.Compare(out, expectXor) {
t.Errorf("got %#v, expected %#v", out, expectXor)
}
Expand All @@ -165,6 +165,15 @@ func TestGalois(t *testing.T) {
}
}

func TestGalois(t *testing.T) {
// invoke with all combinations of asm instructions
testGalois(t, false, false)
testGalois(t, true, false)
if defaultOptions.useAVX2 {
testGalois(t, false, true)
}
}

func TestSliceGalADD(t *testing.T) {

lengthList := []int{16, 32, 34}
Expand Down

0 comments on commit 3610933

Please sign in to comment.