Commit
add asm vsub
herumi committed Aug 23, 2024
1 parent 6206f2f commit 0ae11a9
Showing 3 changed files with 178 additions and 109 deletions.
205 changes: 100 additions & 105 deletions src/asm/bint-x64-amd64.S
@@ -127,100 +127,6 @@ vpcmpgtq %zmm2, %zmm1, %k1
vzeroupper
ret
SIZE(mcl_c5_vsubPre)
.global PRE(mcl_c5_vadd)
PRE(mcl_c5_vadd):
TYPE(mcl_c5_vadd)
mov $4503599627370495, %rax
vpbroadcastq %rax, %zmm16
vmovdqa64 (%rsi), %zmm0
vpaddq (%rdx), %zmm0, %zmm0
vpsrlq $52, %zmm0, %zmm17
vpandq %zmm16, %zmm0, %zmm0
vmovdqa64 64(%rsi), %zmm1
vpaddq 64(%rdx), %zmm1, %zmm1
vpaddq %zmm17, %zmm1, %zmm1
vpsrlq $52, %zmm1, %zmm17
vpandq %zmm16, %zmm1, %zmm1
vmovdqa64 128(%rsi), %zmm2
vpaddq 128(%rdx), %zmm2, %zmm2
vpaddq %zmm17, %zmm2, %zmm2
vpsrlq $52, %zmm2, %zmm17
vpandq %zmm16, %zmm2, %zmm2
vmovdqa64 192(%rsi), %zmm3
vpaddq 192(%rdx), %zmm3, %zmm3
vpaddq %zmm17, %zmm3, %zmm3
vpsrlq $52, %zmm3, %zmm17
vpandq %zmm16, %zmm3, %zmm3
vmovdqa64 256(%rsi), %zmm4
vpaddq 256(%rdx), %zmm4, %zmm4
vpaddq %zmm17, %zmm4, %zmm4
vpsrlq $52, %zmm4, %zmm17
vpandq %zmm16, %zmm4, %zmm4
vmovdqa64 320(%rsi), %zmm5
vpaddq 320(%rdx), %zmm5, %zmm5
vpaddq %zmm17, %zmm5, %zmm5
vpsrlq $52, %zmm5, %zmm17
vpandq %zmm16, %zmm5, %zmm5
vmovdqa64 384(%rsi), %zmm6
vpaddq 384(%rdx), %zmm6, %zmm6
vpaddq %zmm17, %zmm6, %zmm6
vpsrlq $52, %zmm6, %zmm17
vpandq %zmm16, %zmm6, %zmm6
vmovdqa64 448(%rsi), %zmm7
vpaddq 448(%rdx), %zmm7, %zmm7
vpaddq %zmm17, %zmm7, %zmm7
vpsubq PRE(p)(%rip){1to8}, %zmm0, %zmm8
vpsrlq $63, %zmm8, %zmm17
vpandq %zmm16, %zmm8, %zmm8
vpsubq PRE(p)+8(%rip){1to8}, %zmm1, %zmm9
vpsubq %zmm17, %zmm9, %zmm9
vpsrlq $63, %zmm9, %zmm17
vpandq %zmm16, %zmm9, %zmm9
vpsubq PRE(p)+16(%rip){1to8}, %zmm2, %zmm10
vpsubq %zmm17, %zmm10, %zmm10
vpsrlq $63, %zmm10, %zmm17
vpandq %zmm16, %zmm10, %zmm10
vpsubq PRE(p)+24(%rip){1to8}, %zmm3, %zmm11
vpsubq %zmm17, %zmm11, %zmm11
vpsrlq $63, %zmm11, %zmm17
vpandq %zmm16, %zmm11, %zmm11
vpsubq PRE(p)+32(%rip){1to8}, %zmm4, %zmm12
vpsubq %zmm17, %zmm12, %zmm12
vpsrlq $63, %zmm12, %zmm17
vpandq %zmm16, %zmm12, %zmm12
vpsubq PRE(p)+40(%rip){1to8}, %zmm5, %zmm13
vpsubq %zmm17, %zmm13, %zmm13
vpsrlq $63, %zmm13, %zmm17
vpandq %zmm16, %zmm13, %zmm13
vpsubq PRE(p)+48(%rip){1to8}, %zmm6, %zmm14
vpsubq %zmm17, %zmm14, %zmm14
vpsrlq $63, %zmm14, %zmm17
vpandq %zmm16, %zmm14, %zmm14
vpsubq PRE(p)+56(%rip){1to8}, %zmm7, %zmm15
vpsubq %zmm17, %zmm15, %zmm15
vpsrlq $63, %zmm15, %zmm17
vpandq %zmm16, %zmm15, %zmm15
vpxorq %zmm16, %zmm16, %zmm16
vpcmpgtq %zmm16, %zmm17, %k1
vmovdqa64 %zmm0, %zmm8{%k1}
vmovdqa64 %zmm1, %zmm9{%k1}
vmovdqa64 %zmm2, %zmm10{%k1}
vmovdqa64 %zmm3, %zmm11{%k1}
vmovdqa64 %zmm4, %zmm12{%k1}
vmovdqa64 %zmm5, %zmm13{%k1}
vmovdqa64 %zmm6, %zmm14{%k1}
vmovdqa64 %zmm7, %zmm15{%k1}
vmovdqa64 %zmm8, (%rdi)
vmovdqa64 %zmm9, 64(%rdi)
vmovdqa64 %zmm10, 128(%rdi)
vmovdqa64 %zmm11, 192(%rdi)
vmovdqa64 %zmm12, 256(%rdi)
vmovdqa64 %zmm13, 320(%rdi)
vmovdqa64 %zmm14, 384(%rdi)
vmovdqa64 %zmm15, 448(%rdi)
vzeroupper
ret
SIZE(mcl_c5_vadd)
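
The routine above is a branchless modular add on packed 52-bit limbs: eight field elements are handled at once, one limb per 64-bit lane of a zmm register. It adds the limbs with a carry chain, speculatively subtracts the modulus p with a borrow chain, and uses the final borrow (vpcmpgtq into k1) to mask-select, per lane, between the reduced and unreduced sums. Below is a minimal scalar model of one lane in Python; vadd_lane and its argument layout are illustrative assumptions, not part of the source.

MASK = (1 << 52) - 1  # 4503599627370495, the constant broadcast into zmm16

def vadd_lane(x, y, p):
    # x, y, p: lists of 8 limbs of 52 bits each; p is the modulus
    N = 8
    # s = x + y with carry propagation (vpaddq / vpsrlq $52 / vpandq)
    s, carry = [], 0
    for i in range(N):
        v = x[i] + y[i] + carry
        if i < N - 1:
            carry = v >> 52
            v &= MASK
        s.append(v)  # the top limb keeps its carry, as in the asm
    # t = s - p with borrow propagation (vpsubq / vpsrlq $63 / vpandq)
    t, borrow = [], 0
    for i in range(N):
        v = s[i] - p[i] - borrow
        borrow = 1 if v < 0 else 0  # the asm reads bit 63 of the lane instead
        t.append(v & MASK)
    # if the subtraction borrowed, x + y < p: keep s; otherwise keep t
    return s if borrow else t
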
.global PRE(mcl_c5_vaddPreA)
PRE(mcl_c5_vaddPreA):
TYPE(mcl_c5_vaddPreA)
@@ -424,13 +330,11 @@ vpcmpgtq %zmm3, %zmm2, %k2
vzeroupper
ret
SIZE(mcl_c5_vsubPreA)
.global PRE(mcl_c5_vaddA)
PRE(mcl_c5_vaddA):
TYPE(mcl_c5_vaddA)
.global PRE(mcl_c5_vadd)
PRE(mcl_c5_vadd):
TYPE(mcl_c5_vadd)
mov $4503599627370495, %rax
vpbroadcastq %rax, %zmm16
mov $2, %ecx
.L1:
vmovdqa64 (%rsi), %zmm0
vpaddq (%rdx), %zmm0, %zmm0
vpsrlq $52, %zmm0, %zmm17
@@ -517,14 +421,105 @@ vmovdqa64 %zmm12, 256(%rdi)
vmovdqa64 %zmm13, 320(%rdi)
vmovdqa64 %zmm14, 384(%rdi)
vmovdqa64 %zmm15, 448(%rdi)
add $64, %rsi
add $64, %rdx
add $64, %rdi
sub $1, %ecx
jnz .L1
vzeroupper
ret
SIZE(mcl_c5_vaddA)
SIZE(mcl_c5_vadd)
.global PRE(mcl_c5_vsub)
PRE(mcl_c5_vsub):
TYPE(mcl_c5_vsub)
mov $4503599627370495, %rax
vpbroadcastq %rax, %zmm16
vmovdqa64 (%rsi), %zmm0
vpsubq (%rdx), %zmm0, %zmm0
vpsrlq $63, %zmm0, %zmm17
vpandq %zmm16, %zmm0, %zmm0
vmovdqa64 64(%rsi), %zmm1
vpsubq 64(%rdx), %zmm1, %zmm1
vpsubq %zmm17, %zmm1, %zmm1
vpsrlq $63, %zmm1, %zmm17
vpandq %zmm16, %zmm1, %zmm1
vmovdqa64 128(%rsi), %zmm2
vpsubq 128(%rdx), %zmm2, %zmm2
vpsubq %zmm17, %zmm2, %zmm2
vpsrlq $63, %zmm2, %zmm17
vpandq %zmm16, %zmm2, %zmm2
vmovdqa64 192(%rsi), %zmm3
vpsubq 192(%rdx), %zmm3, %zmm3
vpsubq %zmm17, %zmm3, %zmm3
vpsrlq $63, %zmm3, %zmm17
vpandq %zmm16, %zmm3, %zmm3
vmovdqa64 256(%rsi), %zmm4
vpsubq 256(%rdx), %zmm4, %zmm4
vpsubq %zmm17, %zmm4, %zmm4
vpsrlq $63, %zmm4, %zmm17
vpandq %zmm16, %zmm4, %zmm4
vmovdqa64 320(%rsi), %zmm5
vpsubq 320(%rdx), %zmm5, %zmm5
vpsubq %zmm17, %zmm5, %zmm5
vpsrlq $63, %zmm5, %zmm17
vpandq %zmm16, %zmm5, %zmm5
vmovdqa64 384(%rsi), %zmm6
vpsubq 384(%rdx), %zmm6, %zmm6
vpsubq %zmm17, %zmm6, %zmm6
vpsrlq $63, %zmm6, %zmm17
vpandq %zmm16, %zmm6, %zmm6
vmovdqa64 448(%rsi), %zmm7
vpsubq 448(%rdx), %zmm7, %zmm7
vpsubq %zmm17, %zmm7, %zmm7
vpsrlq $63, %zmm7, %zmm17
vpandq %zmm16, %zmm7, %zmm7
vpxorq %zmm8, %zmm8, %zmm8
vpcmpgtq %zmm8, %zmm17, %k1
vpaddq PRE(p)(%rip){1to8}, %zmm0, %zmm8
vpsrlq $52, %zmm8, %zmm17
vpandq %zmm16, %zmm8, %zmm8
vpaddq PRE(p)+8(%rip){1to8}, %zmm1, %zmm9
vpaddq %zmm17, %zmm9, %zmm9
vpsrlq $52, %zmm9, %zmm17
vpandq %zmm16, %zmm9, %zmm9
vpaddq PRE(p)+16(%rip){1to8}, %zmm2, %zmm10
vpaddq %zmm17, %zmm10, %zmm10
vpsrlq $52, %zmm10, %zmm17
vpandq %zmm16, %zmm10, %zmm10
vpaddq PRE(p)+24(%rip){1to8}, %zmm3, %zmm11
vpaddq %zmm17, %zmm11, %zmm11
vpsrlq $52, %zmm11, %zmm17
vpandq %zmm16, %zmm11, %zmm11
vpaddq PRE(p)+32(%rip){1to8}, %zmm4, %zmm12
vpaddq %zmm17, %zmm12, %zmm12
vpsrlq $52, %zmm12, %zmm17
vpandq %zmm16, %zmm12, %zmm12
vpaddq PRE(p)+40(%rip){1to8}, %zmm5, %zmm13
vpaddq %zmm17, %zmm13, %zmm13
vpsrlq $52, %zmm13, %zmm17
vpandq %zmm16, %zmm13, %zmm13
vpaddq PRE(p)+48(%rip){1to8}, %zmm6, %zmm14
vpaddq %zmm17, %zmm14, %zmm14
vpsrlq $52, %zmm14, %zmm17
vpandq %zmm16, %zmm14, %zmm14
vpaddq PRE(p)+56(%rip){1to8}, %zmm7, %zmm15
vpaddq %zmm17, %zmm15, %zmm15
vpsrlq $52, %zmm15, %zmm17
vpandq %zmm16, %zmm15, %zmm15
vmovdqa64 %zmm8, %zmm0{%k1}
vmovdqa64 %zmm9, %zmm1{%k1}
vmovdqa64 %zmm10, %zmm2{%k1}
vmovdqa64 %zmm11, %zmm3{%k1}
vmovdqa64 %zmm12, %zmm4{%k1}
vmovdqa64 %zmm13, %zmm5{%k1}
vmovdqa64 %zmm14, %zmm6{%k1}
vmovdqa64 %zmm15, %zmm7{%k1}
vmovdqa64 %zmm0, (%rdi)
vmovdqa64 %zmm1, 64(%rdi)
vmovdqa64 %zmm2, 128(%rdi)
vmovdqa64 %zmm3, 192(%rdi)
vmovdqa64 %zmm4, 256(%rdi)
vmovdqa64 %zmm5, 320(%rdi)
vmovdqa64 %zmm6, 384(%rdi)
vmovdqa64 %zmm7, 448(%rdi)
vzeroupper
ret
SIZE(mcl_c5_vsub)
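
mcl_c5_vsub is the mirror image: it forms s = x - y with a borrow chain (shift by 63 reads the sign bit of each lane), tests the final borrow to learn whether x < y (vpcmpgtq into k1), speculatively adds p back with a carry chain (shift by 52), and mask-selects per lane. A scalar model of one lane, with the same illustrative conventions as the vadd sketch above:

MASK = (1 << 52) - 1

def vsub_lane(x, y, p):
    # x, y, p: lists of 8 limbs of 52 bits each; p is the modulus
    N = 8
    # s = x - y with borrow propagation (vpsubq / vpsrlq $63 / vpandq)
    s, borrow = [], 0
    for i in range(N):
        v = x[i] - y[i] - borrow
        borrow = 1 if v < 0 else 0
        s.append(v & MASK)
    k1 = borrow  # set iff x < y
    # t = s + p with carry propagation (vpaddq / vpsrlq $52 / vpandq)
    t, carry = [], 0
    for i in range(N):
        v = s[i] + p[i] + carry
        carry = v >> 52
        t.append(v & MASK)
    # masked vmovdqa64: take t where x < y, else s
    return t if k1 else s
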
.align 16
.global PRE(mclb_add1)
PRE(mclb_add1):
49 changes: 48 additions & 1 deletion src/gen_bint_x64.py
@@ -184,6 +184,51 @@ def gen_vadd(mont, vN=1):
			sub(ecx, 1)
			jnz(lpL)

def gen_vsub(mont):
	with FuncProc(MSM_PRE+'vsub'):
		with StackFrame(3, 0, vNum=mont.N*2+2, vType=T_ZMM) as sf:
			regs = list(reversed(sf.v))
			W = mont.W
			N = mont.N
			S = 63
			z = sf.p[0]
			x = sf.p[1]
			y = sf.p[2]
			s = pops(regs, N)
			t = pops(regs, N)
			vmask = pops(regs, 1)[0]
			c = pops(regs, 1)[0]

			mov(rax, mont.mask)
			vpbroadcastq(vmask, rax)

			un = genUnrollFunc()

			# s = x-y
			for i in range(0, N):
				vmovdqa64(s[i], ptr(x+i*64))
				vpsubq(s[i], s[i], ptr(y+i*64))
				if i > 0:
					vpsubq(s[i], s[i], c)
				vpsrlq(c, s[i], S)
				vpandq(s[i], s[i], vmask)

			vpxorq(t[0], t[0], t[0])
			vpcmpgtq(k1, c, t[0]) # k1 = x<y

			# t = s+p
			for i in range(0, N):
				vpaddq(t[i], s[i], ptr_b(rip+C_p+i*8))
				if i > 0:
					vpaddq(t[i], t[i], c)
				vpsrlq(c, t[i], W)
				vpandq(t[i], t[i], vmask)

			# z = select(k1, t, s)
			for i in range(N):
				vmovdqa64(s[i]|k1, t[i])
			un(vmovdqa64)(ptr(z), s)
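
One detail worth noting in gen_vsub: the x-y loop extracts its borrow with S = 63 (the sign bit of the 64-bit lane), while the s+p loop extracts its carry with W = 52 (the bits just above the limb). A short, purely illustrative check of that convention on Python integers:

# Borrow vs. carry extraction on a 64-bit lane holding a 52-bit limb.
MASK64 = (1 << 64) - 1

a, b = 1, 2                    # a < b, so a - b borrows
d = (a - b) & MASK64           # what vpsubq leaves in the lane
assert d >> 63 == 1            # shift by S=63: the sign bit signals the borrow
assert d & ((1 << 52) - 1) == 2**52 - 1  # vpandq keeps the wrapped limb

c = (1 << 52) + 4              # a sum that overflowed the 52-bit limb
assert c >> 52 == 1            # shift by W=52: the carry is everything above it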

def msm_data(mont):
makeLabel(C_p)
dq_(', '.join(map(hex, mont.toArray(mont.p))))
@@ -192,7 +237,9 @@ def msm_code(mont):
	for vN in [1, 2]:
		gen_vaddPre(mont, vN)
		gen_vsubPre(mont, vN)
		gen_vadd(mont, vN)

	gen_vadd(mont)
	gen_vsub(mont)

SUF='_fast'
param=None
33 changes: 30 additions & 3 deletions src/msm_avx.cpp
@@ -30,7 +30,9 @@ void mcl_c5_vsubPre(Vec *, const Vec *, const Vec *);
void mcl_c5_vsubPreA(VecA *, const VecA *, const VecA *);

void mcl_c5_vadd(Vec *, const Vec *, const Vec *);
void mcl_c5_vaddA(VecA *, const VecA *, const VecA *);
void mcl_c5_vsub(Vec *, const Vec *, const Vec *);
//void mcl_c5_vaddA(VecA *, const VecA *, const VecA *);


}

@@ -185,6 +187,13 @@ inline void vsub(V *z, const V *x, const V *y)
	tN[N-1] = vpandq(tN[N-1], G::mask());
	uvselect(z, c, tN, sN);
}
#if 0
template<>
inline void vsub(Vec *z, const Vec *x, const Vec *y)
{
	mcl_c5_vsub(z, x, y);
}
#endif

template<class V>
inline void vmulUnit(V *z, const V *x, const V& y)
@@ -1360,12 +1369,24 @@ bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Func *func)

#ifdef MCL_MSM_TEST
#include <mcl/bls12_381.hpp>
#include <cybozu/test.hpp>
#include <cybozu/xorshift.hpp>
#include <cybozu/benchmark.hpp>

using namespace mcl::bn;

#if 0
#include <string.h>
int main(int argc, char *argv[])
{
	Vec x[8], y[8];
	memset(x, argc, sizeof(x));
	memset(y, argc+1, sizeof(y));
	vsub(x, x, y);
	mcl::bint::dump((const uint8_t*)x, sizeof(x));
}
#else
#include <cybozu/test.hpp>

template<size_t N, int w = W>
inline void toArray(Unit x[N], const mpz_class& mx)
{
@@ -1651,6 +1672,10 @@ CYBOZU_TEST_AUTO(vaddPre)
		for (size_t k = 0; k < N; k++) {
			CYBOZU_TEST_ASSERT(isEqual(z[j].v[k], u[k]));
		}
		mcl_c5_vsub(u, u, y[j].v);
		for (size_t k = 0; k < N; k++) {
			CYBOZU_TEST_ASSERT(isEqual(x[j].v[k], u[k]));
		}
	}
	w[0].clear();
	w[1].clear();
@@ -1701,7 +1726,8 @@ CYBOZU_TEST_AUTO(vaddPre)
CYBOZU_BENCH_C("asm vaddPreA", C, mcl_c5_vaddPreA, za.v, za.v, xa.v);
CYBOZU_BENCH_C("asm vsubPreA", C, mcl_c5_vsubPreA, za.v, za.v, xa.v);
CYBOZU_BENCH_C("asm vadd", C, mcl_c5_vadd, z[0].v, z[0].v, x[0].v);
CYBOZU_BENCH_C("asm vaddA", C, mcl_c5_vaddA, za.v, za.v, xa.v);
CYBOZU_BENCH_C("asm vsub", C, mcl_c5_vadd, z[0].v, z[0].v, x[0].v);
// CYBOZU_BENCH_C("asm vaddA", C, mcl_c5_vaddA, za.v, za.v, xa.v);
#endif
CYBOZU_BENCH_C("vadd::Vec", C, vadd, z[0].v, z[0].v, x[0].v);
CYBOZU_BENCH_C("vsub::Vec", C, vsub, z[0].v, z[0].v, x[0].v);
@@ -2026,3 +2052,4 @@ CYBOZU_TEST_AUTO(mulVec)
#endif
}
#endif
#endif
