// Patch summary (commentary placed ahead of the first "diff --git" header).
//
// galois_amd64.s
//   In galMulAVX2Xor and galMulAVX2 the hand-encoded LONG/WORD/BYTE opcode
//   sequences are replaced by the corresponding assembler mnemonics
//   (VINSERTI128, VPBROADCASTB, VMOVDQU, VPSRLQ, VPAND, VPSHUFB, VPXOR,
//   VZEROUPPER). The removed lines carried the intended instruction in their
//   trailing comments with the destination operand first; the added lines
//   put the destination operand last.
//   Register comments are also corrected: "X6 low" -> "X6: low",
//   "R11: &in" -> "SI: &in", and the X-register names in the loop comments
//   become Y-register names.
//   The only difference between the two loops is that galMulAVX2Xor also
//   loads 32 bytes from (DX) into Y4 and XORs the product into it before
//   storing, while galMulAVX2 stores the product directly.
//
// galois_test.go
//   The former TestGalois body becomes testGalois(t, ssse3, avx2), which
//   forwards both flags to galMulSlice / galMulSliceXor instead of passing
//   (false, false). A new TestGalois calls it with (false, false) and
//   (true, false), and with (false, true) only when defaultOptions.useAVX2
//   is set.
//   NOTE(review): the (true, false) call is not guarded by any CPU-feature
//   flag visible in this patch -- confirm the SSSE3 path is safe on every
//   machine that runs this test. The (true, true) combination is not
//   exercised.
diff --git a/galois_amd64.s b/galois_amd64.s index 57afb4dc..4a0db78c 100644 --- a/galois_amd64.s +++ b/galois_amd64.s @@ -87,31 +87,31 @@ TEXT ·galMulAVX2Xor(SB), 7, $0 MOVQ high+24(FP), DX // DX: &high MOVQ $15, BX // BX: low mask MOVQ BX, X5 - MOVOU (SI), X6 // X6 low + MOVOU (SI), X6 // X6: low MOVOU (DX), X7 // X7: high MOVQ in_len+56(FP), R9 // R9: len(in) - LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low - LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high - LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked) + VINSERTI128 $1, X6, Y6, Y6 // low + VINSERTI128 $1, X7, Y7, Y7 // high + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) - SHRQ $5, R9 // len(in) /32 + SHRQ $5, R9 // len(in) / 32 MOVQ out+72(FP), DX // DX: &out - MOVQ in+48(FP), SI // R11: &in + MOVQ in+48(FP), SI // SI: &in TESTQ R9, R9 JZ done_xor_avx2 loopback_xor_avx2: - LONG $0x066ffec5 // VMOVDQU YMM0, [rsi] - LONG $0x226ffec5 // VMOVDQU YMM4, [rdx] - LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; X1: high input - LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; X0: low input - LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; X1: high input - LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part - LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part - LONG $0xdbefedc5 // VPXOR YMM3, YMM2, YMM3 ; X3: Result - LONG $0xe4efe5c5 // VPXOR YMM4, YMM3, YMM4 ; X4: Result - LONG $0x227ffec5 // VMOVDQU [rdx], YMM4 + VMOVDQU (SI), Y0 + VMOVDQU (DX), Y4 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y1, Y1 // Y1: high input + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPXOR Y3, Y2, Y3 // Y3: Result + VPXOR Y4, Y3, Y4 // Y4: Result + VMOVDQU Y4, (DX) ADDQ $32, SI // in+=32 ADDQ $32, DX // out+=32 @@ -119,8 +119,7 @@ loopback_xor_avx2: JNZ loopback_xor_avx2 done_xor_avx2: - // VZEROUPPER - 
BYTE $0xc5; BYTE $0xf8; BYTE $0x77 + VZEROUPPER RET // func galMulAVX2(low, high, in, out []byte) @@ -129,29 +128,29 @@ TEXT ·galMulAVX2(SB), 7, $0 MOVQ high+24(FP), DX // DX: &high MOVQ $15, BX // BX: low mask MOVQ BX, X5 - MOVOU (SI), X6 // X6 low + MOVOU (SI), X6 // X6: low MOVOU (DX), X7 // X7: high MOVQ in_len+56(FP), R9 // R9: len(in) - LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low - LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high - LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked) + VINSERTI128 $1, X6, Y6, Y6 // low + VINSERTI128 $1, X7, Y7, Y7 // high + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) - SHRQ $5, R9 // len(in) /32 + SHRQ $5, R9 // len(in) / 32 MOVQ out+72(FP), DX // DX: &out - MOVQ in+48(FP), SI // R11: &in + MOVQ in+48(FP), SI // SI: &in TESTQ R9, R9 JZ done_avx2 loopback_avx2: - LONG $0x066ffec5 // VMOVDQU YMM0, [rsi] - LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; X1: high input - LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; X0: low input - LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; X1: high input - LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part - LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part - LONG $0xe3efedc5 // VPXOR YMM4, YMM2, YMM3 ; X4: Result - LONG $0x227ffec5 // VMOVDQU [rdx], YMM4 + VMOVDQU (SI), Y0 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y1, Y1 // Y1: high input + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPXOR Y3, Y2, Y4 // Y4: Result + VMOVDQU Y4, (DX) ADDQ $32, SI // in+=32 ADDQ $32, DX // out+=32 @@ -159,8 +158,7 @@ loopback_avx2: JNZ loopback_avx2 done_avx2: - - BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + VZEROUPPER RET // func sSE2XorSlice(in, out []byte) diff --git a/galois_test.go b/galois_test.go index 120c93f6..707c804c 100644 --- a/galois_test.go +++ 
b/galois_test.go @@ -116,7 +116,7 @@ func TestExp(t *testing.T) { } } -func TestGalois(t *testing.T) { +func testGalois(t *testing.T, ssse3, avx2 bool) { // These values were copied output of the Python code. if galMultiply(3, 4) != 12 { t.Fatal("galMultiply(3, 4) != 12") @@ -131,25 +131,25 @@ func TestGalois(t *testing.T) { // Test slices (>32 entries to test assembler -- AVX2 & NEON) in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185} out := make([]byte, len(in)) - galMulSlice(25, in, out, false, false) + galMulSlice(25, in, out, ssse3, avx2) expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, 0x23, 0x3a, 0x75, 0x6c, 0x47} if 0 != bytes.Compare(out, expect) { t.Errorf("got %#v, expected %#v", out, expect) } expectXor := []byte{0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, 0xcc, 0xe1, 0x22, 0xf, 0x78} - galMulSliceXor(52, in, out, false, false) + galMulSliceXor(52, in, out, ssse3, avx2) if 0 != bytes.Compare(out, expectXor) { t.Errorf("got %#v, expected %#v", out, expectXor) } - galMulSlice(177, in, out, false, false) + galMulSlice(177, in, out, ssse3, avx2) expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, 0x64, 0xd5, 0xe5, 0x54, 0x9a} if 0 != bytes.Compare(out, expect) { t.Errorf("got %#v, expected %#v", out, expect) } expectXor = []byte{0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, 0x4a, 0x8e, 0xe8, 0x2c, 0x7d} - galMulSliceXor(117, in, 
out, false, false) + galMulSliceXor(117, in, out, ssse3, avx2) if 0 != bytes.Compare(out, expectXor) { t.Errorf("got %#v, expected %#v", out, expectXor) } @@ -165,6 +165,15 @@ func TestGalois(t *testing.T) { } } +func TestGalois(t *testing.T) { + // invoke with all combinations of asm instructions + testGalois(t, false, false) + testGalois(t, true, false) + if defaultOptions.useAVX2 { + testGalois(t, false, true) + } +} + func TestSliceGalADD(t *testing.T) { lengthList := []int{16, 32, 34}