From 5dd421bab5d19f6a45083711bf79933817994ad2 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Mon, 27 Oct 2014 22:10:26 +0700 Subject: [PATCH 1/4] Rewrite mul/sqr for 32bit/64bit - interleave calculation of the lower and upper partial product ranges, and reduction - less registers needed, more opportunities for parallel ops --- src/field_10x26_impl.h | 500 ++++++++++++++++++----------------- src/field_5x52_int128_impl.h | 185 +++++++------ 2 files changed, 351 insertions(+), 334 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 33774697f9..bec2fb87e3 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -246,256 +246,262 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1 } SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t *b, uint32_t *r) { - uint64_t c = (uint64_t)a[0] * b[0]; - uint32_t t0 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[1] + - (uint64_t)a[1] * b[0]; - uint32_t t1 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[2] + - (uint64_t)a[1] * b[1] + - (uint64_t)a[2] * b[0]; - uint32_t t2 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[3] + - (uint64_t)a[1] * b[2] + - (uint64_t)a[2] * b[1] + - (uint64_t)a[3] * b[0]; - uint32_t t3 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[4] + - (uint64_t)a[1] * b[3] + - (uint64_t)a[2] * b[2] + - (uint64_t)a[3] * b[1] + - (uint64_t)a[4] * b[0]; - uint32_t t4 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[5] + - (uint64_t)a[1] * b[4] + - (uint64_t)a[2] * b[3] + - (uint64_t)a[3] * b[2] + - (uint64_t)a[4] * b[1] + - (uint64_t)a[5] * b[0]; - uint32_t t5 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[6] + - (uint64_t)a[1] * b[5] + - (uint64_t)a[2] * b[4] + - (uint64_t)a[3] * b[3] + - (uint64_t)a[4] * b[2] + - (uint64_t)a[5] * b[1] + - (uint64_t)a[6] * b[0]; - uint32_t t6 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[7] + - (uint64_t)a[1] * b[6] + - (uint64_t)a[2] * b[5] + - (uint64_t)a[3] * b[4] + - (uint64_t)a[4] * b[3] + - (uint64_t)a[5] * b[2] + - (uint64_t)a[6] * b[1] + - (uint64_t)a[7] * b[0]; - uint32_t t7 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[8] + - (uint64_t)a[1] * b[7] + - (uint64_t)a[2] * b[6] + - (uint64_t)a[3] * b[5] + - (uint64_t)a[4] * b[4] + - (uint64_t)a[5] * b[3] + - (uint64_t)a[6] * b[2] + - (uint64_t)a[7] * b[1] + - (uint64_t)a[8] * b[0]; - uint32_t t8 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[0] * b[9] + - (uint64_t)a[1] * b[8] + - (uint64_t)a[2] * b[7] + - (uint64_t)a[3] * b[6] + - (uint64_t)a[4] * b[5] + - (uint64_t)a[5] * b[4] + - (uint64_t)a[6] * b[3] + - (uint64_t)a[7] * b[2] + - (uint64_t)a[8] * b[1] + - (uint64_t)a[9] * b[0]; - uint32_t t9 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[1] * b[9] + - (uint64_t)a[2] * b[8] + - (uint64_t)a[3] * b[7] + - (uint64_t)a[4] * b[6] + - (uint64_t)a[5] * b[5] + - (uint64_t)a[6] * b[4] + - (uint64_t)a[7] * b[3] + - (uint64_t)a[8] * b[2] + - (uint64_t)a[9] * b[1]; - uint32_t t10 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[2] * b[9] + - (uint64_t)a[3] * b[8] + - (uint64_t)a[4] * b[7] + - (uint64_t)a[5] * b[6] + - (uint64_t)a[6] * b[5] + - (uint64_t)a[7] * b[4] + - (uint64_t)a[8] * b[3] + - (uint64_t)a[9] * b[2]; - uint32_t t11 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[3] * b[9] + - (uint64_t)a[4] * b[8] + - (uint64_t)a[5] * b[7] + - (uint64_t)a[6] * b[6] + - (uint64_t)a[7] * b[5] + - (uint64_t)a[8] * b[4] + - (uint64_t)a[9] * b[3]; - uint32_t t12 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[4] * b[9] + - (uint64_t)a[5] * b[8] + - (uint64_t)a[6] * b[7] + - (uint64_t)a[7] * b[6] + - (uint64_t)a[8] * b[5] + - (uint64_t)a[9] * b[4]; - uint32_t t13 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[5] * b[9] + - (uint64_t)a[6] * b[8] + - (uint64_t)a[7] * b[7] + - (uint64_t)a[8] * b[6] + - (uint64_t)a[9] * b[5]; - uint32_t t14 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[6] * b[9] + - (uint64_t)a[7] * b[8] + - (uint64_t)a[8] * b[7] + - (uint64_t)a[9] * b[6]; - uint32_t t15 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[7] * b[9] + - (uint64_t)a[8] * b[8] + - (uint64_t)a[9] * b[7]; - uint32_t t16 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[8] * b[9] + - (uint64_t)a[9] * b[8]; - uint32_t t17 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[9] * b[9]; - uint32_t t18 = c & 0x3FFFFFFUL; c = c >> 26; - uint32_t t19 = c; - - c = t0 + (uint64_t)t10 * 0x3D10UL; - t0 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t1 + (uint64_t)t10*0x400UL + (uint64_t)t11 * 0x3D10UL; - t1 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t2 + (uint64_t)t11*0x400UL + (uint64_t)t12 * 0x3D10UL; - t2 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t3 + (uint64_t)t12*0x400UL + (uint64_t)t13 * 0x3D10UL; - r[3] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t4 + (uint64_t)t13*0x400UL + (uint64_t)t14 * 0x3D10UL; - r[4] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t5 + (uint64_t)t14*0x400UL + (uint64_t)t15 * 0x3D10UL; - r[5] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t6 + (uint64_t)t15*0x400UL + (uint64_t)t16 * 0x3D10UL; - r[6] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t7 + (uint64_t)t16*0x400UL + (uint64_t)t17 * 0x3D10UL; - r[7] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t8 + (uint64_t)t17*0x400UL + (uint64_t)t18 * 0x3D10UL; - r[8] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t9 + (uint64_t)t18*0x400UL + (uint64_t)t19 * 0x1000003D10ULL; - r[9] = c & 0x03FFFFFUL; c = c >> 22; - uint64_t d = t0 + c * 0x3D1UL; - r[0] = d & 0x3FFFFFFUL; d = d >> 26; - d = d + t1 + c*0x40; - r[1] = d & 0x3FFFFFFUL; d = d >> 26; - r[2] = t2 + d; + + const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; + + uint64_t c, d; + + d = (uint64_t)a[0] * b[9] + + (uint64_t)a[1] * b[8] + + (uint64_t)a[2] * b[7] + + (uint64_t)a[3] * b[6] + + (uint64_t)a[4] * b[5] + + (uint64_t)a[5] * b[4] + + (uint64_t)a[6] * b[3] + + (uint64_t)a[7] * b[2] + + (uint64_t)a[8] * b[1] + + (uint64_t)a[9] * b[0]; + uint32_t t9 = d & M; d >>= 26; + + c = (uint64_t)a[0] * b[0]; + d += (uint64_t)a[1] * b[9] + + (uint64_t)a[2] * b[8] + + (uint64_t)a[3] * b[7] + + (uint64_t)a[4] * b[6] + + (uint64_t)a[5] * b[5] + + (uint64_t)a[6] * b[4] + + (uint64_t)a[7] * b[3] + + (uint64_t)a[8] * b[2] + + (uint64_t)a[9] * b[1]; + uint64_t u0 = d & M; d >>= 26; c += u0 * R0; + uint32_t t0 = c & M; c >>= 26; c += u0 * R1; + + c += (uint64_t)a[0] * b[1] + + (uint64_t)a[1] * b[0]; + d += (uint64_t)a[2] * b[9] + + (uint64_t)a[3] * b[8] + + (uint64_t)a[4] * b[7] + + (uint64_t)a[5] * b[6] + + (uint64_t)a[6] * b[5] + + (uint64_t)a[7] * b[4] + + (uint64_t)a[8] * b[3] + + (uint64_t)a[9] * b[2]; + uint64_t u1 = d & M; d >>= 26; c += u1 * R0; + uint32_t t1 = c & M; c >>= 26; c += u1 * R1; + + c += (uint64_t)a[0] * b[2] + + (uint64_t)a[1] * b[1] + + (uint64_t)a[2] * b[0]; + d += (uint64_t)a[3] * b[9] + + (uint64_t)a[4] * b[8] + + (uint64_t)a[5] * b[7] + + (uint64_t)a[6] * b[6] + + (uint64_t)a[7] * b[5] + + (uint64_t)a[8] * b[4] + + (uint64_t)a[9] * b[3]; + uint64_t u2 = d & M; d >>= 26; c += u2 * R0; + uint32_t t2 = c & M; c >>= 26; c += u2 * R1; + + c += (uint64_t)a[0] * b[3] + + (uint64_t)a[1] * b[2] + + (uint64_t)a[2] * b[1] + + (uint64_t)a[3] * b[0]; + d += (uint64_t)a[4] * b[9] + + (uint64_t)a[5] * b[8] + + (uint64_t)a[6] * b[7] + + (uint64_t)a[7] * b[6] + + (uint64_t)a[8] * b[5] + + (uint64_t)a[9] * b[4]; + uint64_t u3 = d & M; d >>= 26; c += u3 * R0; + uint32_t t3 = c & M; c >>= 26; c += u3 * R1; + + c += (uint64_t)a[0] * b[4] + + (uint64_t)a[1] * b[3] + + (uint64_t)a[2] * b[2] + + (uint64_t)a[3] * b[1] + + (uint64_t)a[4] * b[0]; + d += (uint64_t)a[5] * b[9] + + (uint64_t)a[6] * b[8] + + (uint64_t)a[7] * b[7] + + (uint64_t)a[8] * b[6] + + (uint64_t)a[9] * b[5]; + uint64_t u4 = d & M; d >>= 26; c += u4 * R0; + uint32_t t4 = c & M; c >>= 26; c += u4 * R1; + + c += (uint64_t)a[0] * b[5] + + (uint64_t)a[1] * b[4] + + (uint64_t)a[2] * b[3] + + (uint64_t)a[3] * b[2] + + (uint64_t)a[4] * b[1] + + (uint64_t)a[5] * b[0]; + d += (uint64_t)a[6] * b[9] + + (uint64_t)a[7] * b[8] + + (uint64_t)a[8] * b[7] + + (uint64_t)a[9] * b[6]; + uint64_t u5 = d & M; d >>= 26; c += u5 * R0; + uint32_t t5 = c & M; c >>= 26; c += u5 * R1; + + c += (uint64_t)a[0] * b[6] + + (uint64_t)a[1] * b[5] + + (uint64_t)a[2] * b[4] + + (uint64_t)a[3] * b[3] + + (uint64_t)a[4] * b[2] + + (uint64_t)a[5] * b[1] + + (uint64_t)a[6] * b[0]; + d += (uint64_t)a[7] * b[9] + + (uint64_t)a[8] * b[8] + + (uint64_t)a[9] * b[7]; + uint64_t u6 = d & M; d >>= 26; c += u6 * R0; + uint32_t t6 = c & M; c >>= 26; c += u6 * R1; + + c += (uint64_t)a[0] * b[7] + + (uint64_t)a[1] * b[6] + + (uint64_t)a[2] * b[5] + + (uint64_t)a[3] * b[4] + + (uint64_t)a[4] * b[3] + + (uint64_t)a[5] * b[2] + + (uint64_t)a[6] * b[1] + + (uint64_t)a[7] * b[0]; + d += (uint64_t)a[8] * b[9] + + (uint64_t)a[9] * b[8]; + uint64_t u7 = d & M; d >>= 26; c += u7 * R0; + uint32_t t7 = c & M; c >>= 26; c += u7 * R1; + + c += (uint64_t)a[0] * b[8] + + (uint64_t)a[1] * b[7] + + (uint64_t)a[2] * b[6] + + (uint64_t)a[3] * b[5] + + (uint64_t)a[4] * b[4] + + (uint64_t)a[5] * b[3] + + (uint64_t)a[6] * b[2] + + (uint64_t)a[7] * b[1] + + (uint64_t)a[8] * b[0]; + d += (uint64_t)a[9] * b[9]; + uint64_t u8 = d & M; d >>= 26; c += u8 * R0; + + r[3] = t3; + r[4] = t4; + r[5] = t5; + r[6] = t6; + r[7] = t7; + + r[8] = c & M; c >>= 26; c += u8 * R1; + c += d * R0 + t9; + r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); + + d = c * (R0 >> 4) + t0; + r[0] = d & M; d >>= 26; + d += c * (R1 >> 4) + t1; + r[1] = d & M; d >>= 26; + d += t2; + r[2] = d; } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) { - uint64_t c = (uint64_t)a[0] * a[0]; - uint32_t t0 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[1]; - uint32_t t1 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[2] + - (uint64_t)a[1] * a[1]; - uint32_t t2 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[3] + - (uint64_t)(a[1]*2) * a[2]; - uint32_t t3 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[4] + - (uint64_t)(a[1]*2) * a[3] + - (uint64_t)a[2] * a[2]; - uint32_t t4 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[5] + - (uint64_t)(a[1]*2) * a[4] + - (uint64_t)(a[2]*2) * a[3]; - uint32_t t5 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[6] + - (uint64_t)(a[1]*2) * a[5] + - (uint64_t)(a[2]*2) * a[4] + - (uint64_t)a[3] * a[3]; - uint32_t t6 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[7] + - (uint64_t)(a[1]*2) * a[6] + - (uint64_t)(a[2]*2) * a[5] + - (uint64_t)(a[3]*2) * a[4]; - uint32_t t7 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[8] + - (uint64_t)(a[1]*2) * a[7] + - (uint64_t)(a[2]*2) * a[6] + - (uint64_t)(a[3]*2) * a[5] + - (uint64_t)a[4] * a[4]; - uint32_t t8 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[0]*2) * a[9] + - (uint64_t)(a[1]*2) * a[8] + - (uint64_t)(a[2]*2) * a[7] + - (uint64_t)(a[3]*2) * a[6] + - (uint64_t)(a[4]*2) * a[5]; - uint32_t t9 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[1]*2) * a[9] + - (uint64_t)(a[2]*2) * a[8] + - (uint64_t)(a[3]*2) * a[7] + - (uint64_t)(a[4]*2) * a[6] + - (uint64_t)a[5] * a[5]; - uint32_t t10 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[2]*2) * a[9] + - (uint64_t)(a[3]*2) * a[8] + - (uint64_t)(a[4]*2) * a[7] + - (uint64_t)(a[5]*2) * a[6]; - uint32_t t11 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[3]*2) * a[9] + - (uint64_t)(a[4]*2) * a[8] + - (uint64_t)(a[5]*2) * a[7] + - (uint64_t)a[6] * a[6]; - uint32_t t12 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[4]*2) * a[9] + - (uint64_t)(a[5]*2) * a[8] + - (uint64_t)(a[6]*2) * a[7]; - uint32_t t13 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[5]*2) * a[9] + - (uint64_t)(a[6]*2) * a[8] + - (uint64_t)a[7] * a[7]; - uint32_t t14 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[6]*2) * a[9] + - (uint64_t)(a[7]*2) * a[8]; - uint32_t t15 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[7]*2) * a[9] + - (uint64_t)a[8] * a[8]; - uint32_t t16 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)(a[8]*2) * a[9]; - uint32_t t17 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + (uint64_t)a[9] * a[9]; - uint32_t t18 = c & 0x3FFFFFFUL; c = c >> 26; - uint32_t t19 = c; - - c = t0 + (uint64_t)t10 * 0x3D10UL; - t0 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t1 + (uint64_t)t10*0x400UL + (uint64_t)t11 * 0x3D10UL; - t1 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t2 + (uint64_t)t11*0x400UL + (uint64_t)t12 * 0x3D10UL; - t2 = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t3 + (uint64_t)t12*0x400UL + (uint64_t)t13 * 0x3D10UL; - r[3] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t4 + (uint64_t)t13*0x400UL + (uint64_t)t14 * 0x3D10UL; - r[4] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t5 + (uint64_t)t14*0x400UL + (uint64_t)t15 * 0x3D10UL; - r[5] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t6 + (uint64_t)t15*0x400UL + (uint64_t)t16 * 0x3D10UL; - r[6] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t7 + (uint64_t)t16*0x400UL + (uint64_t)t17 * 0x3D10UL; - r[7] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t8 + (uint64_t)t17*0x400UL + (uint64_t)t18 * 0x3D10UL; - r[8] = c & 0x3FFFFFFUL; c = c >> 26; - c = c + t9 + (uint64_t)t18*0x400UL + (uint64_t)t19 * 0x1000003D10ULL; - r[9] = c & 0x03FFFFFUL; c = c >> 22; - uint64_t d = t0 + c * 0x3D1UL; - r[0] = d & 0x3FFFFFFUL; d = d >> 26; - d = d + t1 + c*0x40; - r[1] = d & 0x3FFFFFFUL; d = d >> 26; - r[2] = t2 + d; + + const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; + + uint64_t c, d; + + d = (uint64_t)(a[0]*2) * a[9] + + (uint64_t)(a[1]*2) * a[8] + + (uint64_t)(a[2]*2) * a[7] + + (uint64_t)(a[3]*2) * a[6] + + (uint64_t)(a[4]*2) * a[5]; + uint32_t t9 = d & M; d >>= 26; + + c = (uint64_t)a[0] * a[0]; + d += (uint64_t)(a[1]*2) * a[9] + + (uint64_t)(a[2]*2) * a[8] + + (uint64_t)(a[3]*2) * a[7] + + (uint64_t)(a[4]*2) * a[6] + + (uint64_t)a[5] * a[5]; + uint64_t u0 = d & M; d >>= 26; c += u0 * R0; + uint32_t t0 = c & M; c >>= 26; c += u0 * R1; + + c += (uint64_t)(a[0]*2) * a[1]; + d += (uint64_t)(a[2]*2) * a[9] + + (uint64_t)(a[3]*2) * a[8] + + (uint64_t)(a[4]*2) * a[7] + + (uint64_t)(a[5]*2) * a[6]; + uint64_t u1 = d & M; d >>= 26; c += u1 * R0; + uint32_t t1 = c & M; c >>= 26; c += u1 * R1; + + c += (uint64_t)(a[0]*2) * a[2] + + (uint64_t)a[1] * a[1]; + d += (uint64_t)(a[3]*2) * a[9] + + (uint64_t)(a[4]*2) * a[8] + + (uint64_t)(a[5]*2) * a[7] + + (uint64_t)a[6] * a[6]; + uint64_t u2 = d & M; d >>= 26; c += u2 * R0; + uint32_t t2 = c & M; c >>= 26; c += u2 * R1; + + c += (uint64_t)(a[0]*2) * a[3] + + (uint64_t)(a[1]*2) * a[2]; + d += (uint64_t)(a[4]*2) * a[9] + + (uint64_t)(a[5]*2) * a[8] + + (uint64_t)(a[6]*2) * a[7]; + uint64_t u3 = d & M; d >>= 26; c += u3 * R0; + uint32_t t3 = c & M; c >>= 26; c += u3 * R1; + + c += (uint64_t)(a[0]*2) * a[4] + + (uint64_t)(a[1]*2) * a[3] + + (uint64_t)a[2] * a[2]; + d += (uint64_t)(a[5]*2) * a[9] + + (uint64_t)(a[6]*2) * a[8] + + (uint64_t)a[7] * a[7]; + uint64_t u4 = d & M; d >>= 26; c += u4 * R0; + uint32_t t4 = c & M; c >>= 26; c += u4 * R1; + + c += (uint64_t)(a[0]*2) * a[5] + + (uint64_t)(a[1]*2) * a[4] + + (uint64_t)(a[2]*2) * a[3]; + d += (uint64_t)(a[6]*2) * a[9] + + (uint64_t)(a[7]*2) * a[8]; + uint64_t u5 = d & M; d >>= 26; c += u5 * R0; + uint32_t t5 = c & M; c >>= 26; c += u5 * R1; + + c += (uint64_t)(a[0]*2) * a[6] + + (uint64_t)(a[1]*2) * a[5] + + (uint64_t)(a[2]*2) * a[4] + + (uint64_t)a[3] * a[3]; + d += (uint64_t)(a[7]*2) * a[9] + + (uint64_t)a[8] * a[8]; + uint64_t u6 = d & M; d >>= 26; c += u6 * R0; + uint32_t t6 = c & M; c >>= 26; c += u6 * R1; + + c += (uint64_t)(a[0]*2) * a[7] + + (uint64_t)(a[1]*2) * a[6] + + (uint64_t)(a[2]*2) * a[5] + + (uint64_t)(a[3]*2) * a[4]; + d += (uint64_t)(a[8]*2) * a[9]; + uint64_t u7 = d & M; d >>= 26; c += u7 * R0; + uint32_t t7 = c & M; c >>= 26; c += u7 * R1; + + c += (uint64_t)(a[0]*2) * a[8] + + (uint64_t)(a[1]*2) * a[7] + + (uint64_t)(a[2]*2) * a[6] + + (uint64_t)(a[3]*2) * a[5] + + (uint64_t)a[4] * a[4]; + d += (uint64_t)a[9] * a[9]; + uint64_t u8 = d & M; d >>= 26; c += u8 * R0; + + r[3] = t3; + r[4] = t4; + r[5] = t5; + r[6] = t6; + r[7] = t7; + + r[8] = c & M; c >>= 26; c += u8 * R1; + c += d * R0 + t9; + r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); + + d = c * (R0 >> 4) + t0; + r[0] = d & M; d >>= 26; + d += c * (R1 >> 4) + t1; + r[1] = d & M; d >>= 26; + d += t2; + r[2] = d; } diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h index 6a131f7588..9d40c9d19b 100644 --- a/src/field_5x52_int128_impl.h +++ b/src/field_5x52_int128_impl.h @@ -8,98 +8,109 @@ #include SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r) { - __int128 c = (__int128)a[0] * b[0]; - uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0 - c = c + (__int128)a[0] * b[1] + - (__int128)a[1] * b[0]; - uint64_t t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 20000000000000BF - c = c + (__int128)a[0] * b[2] + - (__int128)a[1] * b[1] + - (__int128)a[2] * b[0]; - uint64_t t2 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 30000000000001A0 - c = c + (__int128)a[0] * b[3] + - (__int128)a[1] * b[2] + - (__int128)a[2] * b[1] + - (__int128)a[3] * b[0]; - uint64_t t3 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 4000000000000280 - c = c + (__int128)a[0] * b[4] + - (__int128)a[1] * b[3] + - (__int128)a[2] * b[2] + - (__int128)a[3] * b[1] + - (__int128)a[4] * b[0]; - uint64_t t4 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 320000000000037E - c = c + (__int128)a[1] * b[4] + - (__int128)a[2] * b[3] + - (__int128)a[3] * b[2] + - (__int128)a[4] * b[1]; - uint64_t t5 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 22000000000002BE - c = c + (__int128)a[2] * b[4] + - (__int128)a[3] * b[3] + - (__int128)a[4] * b[2]; - uint64_t t6 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 12000000000001DE - c = c + (__int128)a[3] * b[4] + - (__int128)a[4] * b[3]; - uint64_t t7 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 02000000000000FE - c = c + (__int128)a[4] * b[4]; - uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E - uint64_t t9 = c; - - c = t0 + (__int128)t5 * 0x1000003D10ULL; - t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t1 + (__int128)t6 * 0x1000003D10ULL; - t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t2 + (__int128)t7 * 0x1000003D10ULL; - r[2] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t3 + (__int128)t8 * 0x1000003D10ULL; - r[3] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t4 + (__int128)t9 * 0x1000003D10ULL; - r[4] = c & 0x0FFFFFFFFFFFFULL; c = c >> 48; // c max 000001000003D110 - c = t0 + (__int128)c * 0x1000003D1ULL; - r[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008 - r[1] = t1 + c; + const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; + + __int128 c, d; + + d = (__int128)a[0] * b[3] + + (__int128)a[1] * b[2] + + (__int128)a[2] * b[1] + + (__int128)a[3] * b[0]; + c = (__int128)a[4] * b[4]; + d += (c & M) * R; c >>= 52; + uint64_t t3 = d & M; d >>= 52; + + d += (__int128)a[0] * b[4] + + (__int128)a[1] * b[3] + + (__int128)a[2] * b[2] + + (__int128)a[3] * b[1] + + (__int128)a[4] * b[0]; + d += c * R; + uint64_t t4 = d & M; d >>= 52; + uint64_t tx = (t4 >> 48); t4 &= (M >> 4); + + c = (__int128)a[0] * b[0]; + d += (__int128)a[1] * b[4] + + (__int128)a[2] * b[3] + + (__int128)a[3] * b[2] + + (__int128)a[4] * b[1]; + uint64_t u0 = d & M; d >>= 52; + u0 = (u0 << 4) | tx; + c += (__int128)u0 * (R >> 4); + uint64_t t0 = c & M; c >>= 52; + + c += (__int128)a[0] * b[1] + + (__int128)a[1] * b[0]; + d += (__int128)a[2] * b[4] + + (__int128)a[3] * b[3] + + (__int128)a[4] * b[2]; + c += (d & M) * R; d >>= 52; + uint64_t t1 = c & M; c >>= 52; + + c += (__int128)a[0] * b[2] + + (__int128)a[1] * b[1] + + (__int128)a[2] * b[0]; + d += (__int128)a[3] * b[4] + + (__int128)a[4] * b[3]; + c += (d & M) * R; d >>= 52; + + r[0] = t0; + r[1] = t1; + r[2] = c & M; c >>= 52; + c += d * R + t3;; + r[3] = c & M; c >>= 52; + c += t4; + r[4] = c; } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) { - __int128 c = (__int128)a[0] * a[0]; - uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0 - c = c + (__int128)(a[0]*2) * a[1]; - uint64_t t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 20000000000000BF - c = c + (__int128)(a[0]*2) * a[2] + - (__int128)a[1] * a[1]; - uint64_t t2 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 30000000000001A0 - c = c + (__int128)(a[0]*2) * a[3] + - (__int128)(a[1]*2) * a[2]; - uint64_t t3 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 4000000000000280 - c = c + (__int128)(a[0]*2) * a[4] + - (__int128)(a[1]*2) * a[3] + - (__int128)a[2] * a[2]; - uint64_t t4 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 320000000000037E - c = c + (__int128)(a[1]*2) * a[4] + - (__int128)(a[2]*2) * a[3]; - uint64_t t5 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 22000000000002BE - c = c + (__int128)(a[2]*2) * a[4] + - (__int128)a[3] * a[3]; - uint64_t t6 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 12000000000001DE - c = c + (__int128)(a[3]*2) * a[4]; - uint64_t t7 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 02000000000000FE - c = c + (__int128)a[4] * a[4]; - uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E - uint64_t t9 = c; - c = t0 + (__int128)t5 * 0x1000003D10ULL; - t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t1 + (__int128)t6 * 0x1000003D10ULL; - t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t2 + (__int128)t7 * 0x1000003D10ULL; - r[2] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t3 + (__int128)t8 * 0x1000003D10ULL; - r[3] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10 - c = c + t4 + (__int128)t9 * 0x1000003D10ULL; - r[4] = c & 0x0FFFFFFFFFFFFULL; c = c >> 48; // c max 000001000003D110 - c = t0 + (__int128)c * 0x1000003D1ULL; - r[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008 - r[1] = t1 + c; + const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; + + __int128 c, d; + + uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + + d = (__int128)(a0*2) * a3 + + (__int128)(a1*2) * a2; + c = (__int128)a4 * a4; + d += (c & M) * R; c >>= 52; + uint64_t t3 = d & M; d >>= 52; + + a4 *= 2; + d += (__int128)a0 * a4 + + (__int128)(a1*2) * a3 + + (__int128)a2 * a2; + d += c * R; + uint64_t t4 = d & M; d >>= 52; + uint64_t tx = (t4 >> 48); t4 &= (M >> 4); + + c = (__int128)a0 * a0; + d += (__int128)a1 * a4 + + (__int128)(a2*2) * a3; + uint64_t u0 = d & M; d >>= 52; + u0 = (u0 << 4) | tx; + c += (__int128)u0 * (R >> 4); + r[0] = c & M; c >>= 52; + + a0 *= 2; + c += (__int128)a0 * a1; + d += (__int128)a2 * a4 + + (__int128)a3 * a3; + c += (d & M) * R; d >>= 52; + r[1] = c & M; c >>= 52; + + c += (__int128)a0 * a2 + + (__int128)a1 * a1; + d += (__int128)a3 * a4; + c += (d & M) * R; d >>= 52; + r[2] = c & M; c >>= 52; + + c += d * R + t3;; + r[3] = c & M; c >>= 52; + c += t4; + r[4] = c; } #endif From fa0d620668730e090b31cd1cd0ae3dd42d81ff2a Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Thu, 13 Nov 2014 07:00:44 -0800 Subject: [PATCH 2/4] Add equalities relating input and output variables --- src/field_10x26_impl.h | 132 +++++++++++++++++++++++++++++++++++ src/field_5x52_int128_impl.h | 62 ++++++++++++++++ 2 files changed, 194 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index bec2fb87e3..1e78599c90 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -248,6 +248,9 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1 SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t *b, uint32_t *r) { const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; + // [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n. + // px is a shorthand for sum(a[i]*b[x-i], i=0..x). + // Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0]. uint64_t c, d; @@ -261,9 +264,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[2] + (uint64_t)a[8] * b[1] + (uint64_t)a[9] * b[0]; + // [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] uint32_t t9 = d & M; d >>= 26; + // [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] c = (uint64_t)a[0] * b[0]; + // [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0] d += (uint64_t)a[1] * b[9] + (uint64_t)a[2] * b[8] + (uint64_t)a[3] * b[7] @@ -273,11 +279,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[3] + (uint64_t)a[8] * b[2] + (uint64_t)a[9] * b[1]; + // [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint64_t u0 = d & M; d >>= 26; c += u0 * R0; + // [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint32_t t0 = c & M; c >>= 26; c += u0 * R1; + // [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] + // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0] c += (uint64_t)a[0] * b[1] + (uint64_t)a[1] * b[0]; + // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0] d += (uint64_t)a[2] * b[9] + (uint64_t)a[3] * b[8] + (uint64_t)a[4] * b[7] @@ -286,12 +297,17 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[4] + (uint64_t)a[8] * b[3] + (uint64_t)a[9] * b[2]; + // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint64_t u1 = d & M; d >>= 26; c += u1 * R0; + // [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint32_t t1 = c & M; c >>= 26; c += u1 * R1; + // [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] + // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] c += (uint64_t)a[0] * b[2] + (uint64_t)a[1] * b[1] + (uint64_t)a[2] * b[0]; + // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] d += (uint64_t)a[3] * b[9] + (uint64_t)a[4] * b[8] + (uint64_t)a[5] * b[7] @@ -299,34 +315,48 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[5] + (uint64_t)a[8] * b[4] + (uint64_t)a[9] * b[3]; + // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint64_t u2 = d & M; d >>= 26; c += u2 * R0; + // [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint32_t t2 = c & M; c >>= 26; c += u2 * R1; + // [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] + // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] c += (uint64_t)a[0] * b[3] + (uint64_t)a[1] * b[2] + (uint64_t)a[2] * b[1] + (uint64_t)a[3] * b[0]; + // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] d += (uint64_t)a[4] * b[9] + (uint64_t)a[5] * b[8] + (uint64_t)a[6] * b[7] + (uint64_t)a[7] * b[6] + (uint64_t)a[8] * b[5] + (uint64_t)a[9] * b[4]; + // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint64_t u3 = d & M; d >>= 26; c += u3 * R0; + // [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint32_t t3 = c & M; c >>= 26; c += u3 * R1; + // [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] + // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] c += (uint64_t)a[0] * b[4] + (uint64_t)a[1] * b[3] + (uint64_t)a[2] * b[2] + (uint64_t)a[3] * b[1] + (uint64_t)a[4] * b[0]; + // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] d += (uint64_t)a[5] * b[9] + (uint64_t)a[6] * b[8] + (uint64_t)a[7] * b[7] + (uint64_t)a[8] * b[6] + (uint64_t)a[9] * b[5]; + // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint64_t u4 = d & M; d >>= 26; c += u4 * R0; + // [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint32_t t4 = c & M; c >>= 26; c += u4 * R1; + // [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] c += (uint64_t)a[0] * b[5] + (uint64_t)a[1] * b[4] @@ -334,12 +364,17 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[3] * b[2] + (uint64_t)a[4] * b[1] + (uint64_t)a[5] * b[0]; + // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[6] * b[9] + (uint64_t)a[7] * b[8] + (uint64_t)a[8] * b[7] + (uint64_t)a[9] * b[6]; + // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint64_t u5 = d & M; d >>= 26; c += u5 * R0; + // [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint32_t t5 = c & M; c >>= 26; c += u5 * R1; + // [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] c += (uint64_t)a[0] * b[6] + (uint64_t)a[1] * b[5] @@ -348,11 +383,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[4] * b[2] + (uint64_t)a[5] * b[1] + (uint64_t)a[6] * b[0]; + // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[7] * b[9] + (uint64_t)a[8] * b[8] + (uint64_t)a[9] * b[7]; + // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint64_t u6 = d & M; d >>= 26; c += u6 * R0; + // [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint32_t t6 = c & M; c >>= 26; c += u6 * R1; + // [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] c += (uint64_t)a[0] * b[7] + (uint64_t)a[1] * b[6] @@ -362,10 +402,15 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[5] * b[2] + (uint64_t)a[6] * b[1] + (uint64_t)a[7] * b[0]; + // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[8] * b[9] + (uint64_t)a[9] * b[8]; + // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u7 = d & M; d >>= 26; c += u7 * R0; + // [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint32_t t7 = c & M; c >>= 26; c += u7 * R1; + // [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] c += (uint64_t)a[0] * b[8] + (uint64_t)a[1] * b[7] @@ -376,30 +421,54 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[6] * b[2] + (uint64_t)a[7] * b[1] + (uint64_t)a[8] * b[0]; + // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[9] * b[9]; + // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u8 = d & M; d >>= 26; c += u8 * R0; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = t3; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = t4; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[5] = t5; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[6] = t6; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[7] = t7; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[8] = c & M; c >>= 26; c += u8 * R1; + // [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R0 + t9; + // [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); + // [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d = c * (R0 >> 4) + t0; + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[0] = d & M; d >>= 26; + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += c * (R1 >> 4) + t1; + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[1] = d & M; d >>= 26; + // [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += t2; + // [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = d; + // [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) { const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; + // [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n. + // px is a shorthand for sum(a[i]*a[x-i], i=0..x). + // Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0]. uint64_t c, d; @@ -408,100 +477,163 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t + (uint64_t)(a[2]*2) * a[7] + (uint64_t)(a[3]*2) * a[6] + (uint64_t)(a[4]*2) * a[5]; + // [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] uint32_t t9 = d & M; d >>= 26; + // [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] c = (uint64_t)a[0] * a[0]; + // [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0] d += (uint64_t)(a[1]*2) * a[9] + (uint64_t)(a[2]*2) * a[8] + (uint64_t)(a[3]*2) * a[7] + (uint64_t)(a[4]*2) * a[6] + (uint64_t)a[5] * a[5]; + // [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint64_t u0 = d & M; d >>= 26; c += u0 * R0; + // [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint32_t t0 = c & M; c >>= 26; c += u0 * R1; + // [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] + // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0] c += (uint64_t)(a[0]*2) * a[1]; + // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0] d += (uint64_t)(a[2]*2) * a[9] + (uint64_t)(a[3]*2) * a[8] + (uint64_t)(a[4]*2) * a[7] + (uint64_t)(a[5]*2) * a[6]; + // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint64_t u1 = d & M; d >>= 26; c += u1 * R0; + // [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint32_t t1 = c & M; c >>= 26; c += u1 * R1; + // [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] + // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] c += (uint64_t)(a[0]*2) * a[2] + (uint64_t)a[1] * a[1]; + // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] d += (uint64_t)(a[3]*2) * a[9] + (uint64_t)(a[4]*2) * a[8] + (uint64_t)(a[5]*2) * a[7] + (uint64_t)a[6] * a[6]; + // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint64_t u2 = d & M; d >>= 26; c += u2 * R0; + // [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint32_t t2 = c & M; c >>= 26; c += u2 * R1; + // [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] + // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[3] + (uint64_t)(a[1]*2) * a[2]; + // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] d += (uint64_t)(a[4]*2) * a[9] + (uint64_t)(a[5]*2) * a[8] + (uint64_t)(a[6]*2) * a[7]; + // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint64_t u3 = d & M; d >>= 26; c += u3 * R0; + // [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint32_t t3 = c & M; c >>= 26; c += u3 * R1; + // [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] + // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[4] + (uint64_t)(a[1]*2) * a[3] + (uint64_t)a[2] * a[2]; + // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] d += (uint64_t)(a[5]*2) * a[9] + (uint64_t)(a[6]*2) * a[8] + (uint64_t)a[7] * a[7]; + // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint64_t u4 = d & M; d >>= 26; c += u4 * R0; + // [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint32_t t4 = c & M; c >>= 26; c += u4 * R1; + // [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[5] + (uint64_t)(a[1]*2) * a[4] + (uint64_t)(a[2]*2) * a[3]; + // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] d += (uint64_t)(a[6]*2) * a[9] + (uint64_t)(a[7]*2) * a[8]; + // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint64_t u5 = d & M; d >>= 26; c += u5 * R0; + // [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint32_t t5 = c & M; c >>= 26; c += u5 * R1; + // [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[6] + (uint64_t)(a[1]*2) * a[5] + (uint64_t)(a[2]*2) * a[4] + (uint64_t)a[3] * a[3]; + // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)(a[7]*2) * a[9] + (uint64_t)a[8] * a[8]; + // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint64_t u6 = d & M; d >>= 26; c += u6 * R0; + // [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint32_t t6 = c & M; c >>= 26; c += u6 * R1; + // [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[7] + (uint64_t)(a[1]*2) * a[6] + (uint64_t)(a[2]*2) * a[5] + (uint64_t)(a[3]*2) * a[4]; + // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)(a[8]*2) * a[9]; + // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u7 = d & M; d >>= 26; c += u7 * R0; + // [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint32_t t7 = c & M; c >>= 26; c += u7 * R1; + // [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[8] + (uint64_t)(a[1]*2) * a[7] + (uint64_t)(a[2]*2) * a[6] + (uint64_t)(a[3]*2) * a[5] + (uint64_t)a[4] * a[4]; + // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[9] * a[9]; + // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u8 = d & M; d >>= 26; c += u8 * R0; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = t3; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = t4; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[5] = t5; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[6] = t6; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[7] = t7; + // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[8] = c & M; c >>= 26; c += u8 * R1; + // [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R0 + t9; + // [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); + // [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d = c * (R0 >> 4) + t0; + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[0] = d & M; d >>= 26; + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += c * (R1 >> 4) + t1; + // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] + // [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[1] = d & M; d >>= 26; + // [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += t2; + // [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = d; + // [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] } diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h index 9d40c9d19b..93b8e1d2f4 100644 --- a/src/field_5x52_int128_impl.h +++ b/src/field_5x52_int128_impl.h @@ -10,6 +10,9 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r) { const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; + // [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n. + // px is a shorthand for sum(a[i]*b[x-i], i=0..x). + // Note that [x 0 0 0 0 0] = [x*R]. __int128 c, d; @@ -17,56 +20,88 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin + (__int128)a[1] * b[2] + (__int128)a[2] * b[1] + (__int128)a[3] * b[0]; + // [d 0 0 0] = [p3 0 0 0] c = (__int128)a[4] * b[4]; + // [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] d += (c & M) * R; c >>= 52; + // [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] uint64_t t3 = d & M; d >>= 52; + // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] d += (__int128)a[0] * b[4] + (__int128)a[1] * b[3] + (__int128)a[2] * b[2] + (__int128)a[3] * b[1] + (__int128)a[4] * b[0]; + // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] d += c * R; + // [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t t4 = d & M; d >>= 52; + // [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t tx = (t4 >> 48); t4 &= (M >> 4); + // [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] c = (__int128)a[0] * b[0]; + // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] d += (__int128)a[1] * b[4] + (__int128)a[2] * b[3] + (__int128)a[3] * b[2] + (__int128)a[4] * b[1]; + // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] uint64_t u0 = d & M; d >>= 52; + // [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] + // [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] u0 = (u0 << 4) | tx; + // [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] c += (__int128)u0 * (R >> 4); + // [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] uint64_t t0 = c & M; c >>= 52; + // [d 0 t4 t3 0 c t0] = [p8 0 0 p5 p4 p3 0 0 p0] c += (__int128)a[0] * b[1] + (__int128)a[1] * b[0]; + // [d 0 t4 t3 0 c t0] = [p8 0 0 p5 p4 p3 0 p1 p0] d += (__int128)a[2] * b[4] + (__int128)a[3] * b[3] + (__int128)a[4] * b[2]; + // [d 0 t4 t3 0 c t0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (d & M) * R; d >>= 52; + // [d 0 0 t4 t3 0 c t0] = [p8 0 p6 p5 p4 p3 0 p1 p0] uint64_t t1 = c & M; c >>= 52; + // [d 0 0 t4 t3 c t1 t0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (__int128)a[0] * b[2] + (__int128)a[1] * b[1] + (__int128)a[2] * b[0]; + // [d 0 0 t4 t3 c t1 t0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] d += (__int128)a[3] * b[4] + (__int128)a[4] * b[3]; + // [d 0 0 t4 t3 c t1 t0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += (d & M) * R; d >>= 52; + // [d 0 0 0 t4 t3 c t1 t0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[0] = t0; + // [d 0 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[1] = t1; + // [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = c & M; c >>= 52; + // [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R + t3;; + // [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = c & M; c >>= 52; + // [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += t4; + // [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = c; + // [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) { const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; + // [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n. + // px is a shorthand for sum(a[i]*a[x-i], i=0..x). + // Note that [x 0 0 0 0 0] = [x*R]. __int128 c, d; @@ -74,43 +109,70 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t d = (__int128)(a0*2) * a3 + (__int128)(a1*2) * a2; + // [d 0 0 0] = [p3 0 0 0] c = (__int128)a4 * a4; + // [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] d += (c & M) * R; c >>= 52; + // [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] uint64_t t3 = d & M; d >>= 52; + // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] a4 *= 2; d += (__int128)a0 * a4 + (__int128)(a1*2) * a3 + (__int128)a2 * a2; + // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] d += c * R; + // [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t t4 = d & M; d >>= 52; + // [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t tx = (t4 >> 48); t4 &= (M >> 4); + // [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] c = (__int128)a0 * a0; + // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] d += (__int128)a1 * a4 + (__int128)(a2*2) * a3; + // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] uint64_t u0 = d & M; d >>= 52; + // [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] + // [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] u0 = (u0 << 4) | tx; + // [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] c += (__int128)u0 * (R >> 4); + // [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] r[0] = c & M; c >>= 52; + // [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] a0 *= 2; c += (__int128)a0 * a1; + // [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] d += (__int128)a2 * a4 + (__int128)a3 * a3; + // [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (d & M) * R; d >>= 52; + // [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] r[1] = c & M; c >>= 52; + // [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (__int128)a0 * a2 + (__int128)a1 * a1; + // [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] d += (__int128)a3 * a4; + // [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += (d & M) * R; d >>= 52; + // [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = c & M; c >>= 52; + // [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R + t3;; + // [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = c & M; c >>= 52; + // [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += t4; + // [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = c; + // [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] } #endif From a51859871afc5532a3cc0b07bc947894211218f2 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Thu, 13 Nov 2014 07:47:40 -0800 Subject: [PATCH 3/4] Add overflow analysis to field_5x52_int128_impl.h --- src/field_5x52_int128_impl.h | 97 ++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h index 93b8e1d2f4..a7b83ca221 100644 --- a/src/field_5x52_int128_impl.h +++ b/src/field_5x52_int128_impl.h @@ -7,7 +7,23 @@ #include +#ifdef VERIFY +#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0) +#else +#define VERIFY_BITS(x, n) do { } while(0) +#endif + SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r) { + VERIFY_BITS(a[0], 56); + VERIFY_BITS(a[1], 56); + VERIFY_BITS(a[2], 56); + VERIFY_BITS(a[3], 56); + VERIFY_BITS(a[4], 52); + VERIFY_BITS(b[0], 56); + VERIFY_BITS(b[1], 56); + VERIFY_BITS(b[2], 56); + VERIFY_BITS(b[3], 56); + VERIFY_BITS(b[4], 52); const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; // [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n. @@ -20,12 +36,18 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin + (__int128)a[1] * b[2] + (__int128)a[2] * b[1] + (__int128)a[3] * b[0]; + VERIFY_BITS(d, 114); // [d 0 0 0] = [p3 0 0 0] c = (__int128)a[4] * b[4]; + VERIFY_BITS(c, 112); // [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] d += (c & M) * R; c >>= 52; + VERIFY_BITS(d, 115); + VERIFY_BITS(c, 60); // [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] uint64_t t3 = d & M; d >>= 52; + VERIFY_BITS(t3, 52); + VERIFY_BITS(d, 63); // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] d += (__int128)a[0] * b[4] @@ -33,70 +55,108 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin + (__int128)a[2] * b[2] + (__int128)a[3] * b[1] + (__int128)a[4] * b[0]; + VERIFY_BITS(d, 115); // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] d += c * R; + VERIFY_BITS(d, 116); // [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t t4 = d & M; d >>= 52; + VERIFY_BITS(t4, 52); + VERIFY_BITS(d, 64); // [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t tx = (t4 >> 48); t4 &= (M >> 4); + VERIFY_BITS(tx, 4); + VERIFY_BITS(t4, 48); // [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] c = (__int128)a[0] * b[0]; + VERIFY_BITS(c, 112); // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] d += (__int128)a[1] * b[4] + (__int128)a[2] * b[3] + (__int128)a[3] * b[2] + (__int128)a[4] * b[1]; + VERIFY_BITS(d, 115); // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] uint64_t u0 = d & M; d >>= 52; + VERIFY_BITS(u0, 52); + VERIFY_BITS(d, 63); // [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] // [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] u0 = (u0 << 4) | tx; + VERIFY_BITS(u0, 56); // [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] c += (__int128)u0 * (R >> 4); + VERIFY_BITS(c, 115); // [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] uint64_t t0 = c & M; c >>= 52; + VERIFY_BITS(t0, 52); + VERIFY_BITS(c, 61); // [d 0 t4 t3 0 c t0] = [p8 0 0 p5 p4 p3 0 0 p0] c += (__int128)a[0] * b[1] + (__int128)a[1] * b[0]; + VERIFY_BITS(c, 114); // [d 0 t4 t3 0 c t0] = [p8 0 0 p5 p4 p3 0 p1 p0] d += (__int128)a[2] * b[4] + (__int128)a[3] * b[3] + (__int128)a[4] * b[2]; + VERIFY_BITS(d, 114); // [d 0 t4 t3 0 c t0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (d & M) * R; d >>= 52; + VERIFY_BITS(c, 115); + VERIFY_BITS(d, 62); // [d 0 0 t4 t3 0 c t0] = [p8 0 p6 p5 p4 p3 0 p1 p0] uint64_t t1 = c & M; c >>= 52; + VERIFY_BITS(t1, 52); + VERIFY_BITS(c, 63); // [d 0 0 t4 t3 c t1 t0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (__int128)a[0] * b[2] + (__int128)a[1] * b[1] + (__int128)a[2] * b[0]; + VERIFY_BITS(c, 114); // [d 0 0 t4 t3 c t1 t0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] d += (__int128)a[3] * b[4] + (__int128)a[4] * b[3]; + VERIFY_BITS(d, 114); // [d 0 0 t4 t3 c t1 t0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += (d & M) * R; d >>= 52; + VERIFY_BITS(c, 115); + VERIFY_BITS(d, 62); // [d 0 0 0 t4 t3 c t1 t0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[0] = t0; + VERIFY_BITS(r[0], 52); // [d 0 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[1] = t1; + VERIFY_BITS(r[1], 52); // [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = c & M; c >>= 52; + VERIFY_BITS(r[2], 52); + VERIFY_BITS(c, 63); // [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R + t3;; + VERIFY_BITS(c, 100); // [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = c & M; c >>= 52; + VERIFY_BITS(r[3], 52); + VERIFY_BITS(c, 48); // [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += t4; + VERIFY_BITS(c, 49); // [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = c; + VERIFY_BITS(r[4], 49); // [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) { + VERIFY_BITS(a[0], 56); + VERIFY_BITS(a[1], 56); + VERIFY_BITS(a[2], 56); + VERIFY_BITS(a[3], 56); + VERIFY_BITS(a[4], 52); const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; // [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n. @@ -109,69 +169,106 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t d = (__int128)(a0*2) * a3 + (__int128)(a1*2) * a2; + VERIFY_BITS(d, 114); // [d 0 0 0] = [p3 0 0 0] c = (__int128)a4 * a4; + VERIFY_BITS(c, 112); // [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] d += (c & M) * R; c >>= 52; + VERIFY_BITS(d, 115); + VERIFY_BITS(c, 60); // [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] uint64_t t3 = d & M; d >>= 52; + VERIFY_BITS(t3, 52); + VERIFY_BITS(d, 63); // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] a4 *= 2; d += (__int128)a0 * a4 + (__int128)(a1*2) * a3 + (__int128)a2 * a2; + VERIFY_BITS(d, 115); // [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] d += c * R; + VERIFY_BITS(d, 116); // [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t t4 = d & M; d >>= 52; + VERIFY_BITS(t4, 52); + VERIFY_BITS(d, 64); // [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] uint64_t tx = (t4 >> 48); t4 &= (M >> 4); + VERIFY_BITS(tx, 4); + VERIFY_BITS(t4, 48); // [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] c = (__int128)a0 * a0; + VERIFY_BITS(c, 112); // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] d += (__int128)a1 * a4 + (__int128)(a2*2) * a3; + VERIFY_BITS(d, 114); // [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] uint64_t u0 = d & M; d >>= 52; + VERIFY_BITS(u0, 52); + VERIFY_BITS(d, 62); // [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] // [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] u0 = (u0 << 4) | tx; + VERIFY_BITS(u0, 56); // [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] c += (__int128)u0 * (R >> 4); + VERIFY_BITS(c, 113); // [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] r[0] = c & M; c >>= 52; + VERIFY_BITS(r[0], 52); + VERIFY_BITS(c, 61); // [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] a0 *= 2; c += (__int128)a0 * a1; + VERIFY_BITS(c, 114); // [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] d += (__int128)a2 * a4 + (__int128)a3 * a3; + VERIFY_BITS(d, 114); // [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (d & M) * R; d >>= 52; + VERIFY_BITS(c, 115); + VERIFY_BITS(d, 62); // [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] r[1] = c & M; c >>= 52; + VERIFY_BITS(r[1], 52); + VERIFY_BITS(c, 63); // [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] c += (__int128)a0 * a2 + (__int128)a1 * a1; + VERIFY_BITS(c, 114); // [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] d += (__int128)a3 * a4; + VERIFY_BITS(d, 114); // [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += (d & M) * R; d >>= 52; + VERIFY_BITS(c, 115); + VERIFY_BITS(d, 62); // [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = c & M; c >>= 52; + VERIFY_BITS(r[2], 52); + VERIFY_BITS(c, 63); // [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R + t3;; + VERIFY_BITS(c, 100); // [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = c & M; c >>= 52; + VERIFY_BITS(r[3], 52); + VERIFY_BITS(c, 48); // [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] c += t4; + VERIFY_BITS(c, 49); // [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = c; + VERIFY_BITS(r[4], 49); // [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] } From f8cce9565014a3e892b346eb48c384148ff67093 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Fri, 14 Nov 2014 17:52:39 +0100 Subject: [PATCH 4/4] Add overflow analysis to field_10x26_impl.h --- src/field_10x26_impl.h | 212 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 1e78599c90..47e7b0df7a 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -245,7 +245,33 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1 #endif } +#ifdef VERIFY +#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0) +#else +#define VERIFY_BITS(x, n) do { } while(0) +#endif + SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t *b, uint32_t *r) { + VERIFY_BITS(a[0], 30); + VERIFY_BITS(a[1], 30); + VERIFY_BITS(a[2], 30); + VERIFY_BITS(a[3], 30); + VERIFY_BITS(a[4], 30); + VERIFY_BITS(a[5], 30); + VERIFY_BITS(a[6], 30); + VERIFY_BITS(a[7], 30); + VERIFY_BITS(a[8], 30); + VERIFY_BITS(a[9], 26); + VERIFY_BITS(b[0], 30); + VERIFY_BITS(b[1], 30); + VERIFY_BITS(b[2], 30); + VERIFY_BITS(b[3], 30); + VERIFY_BITS(b[4], 30); + VERIFY_BITS(b[5], 30); + VERIFY_BITS(b[6], 30); + VERIFY_BITS(b[7], 30); + VERIFY_BITS(b[8], 30); + VERIFY_BITS(b[9], 26); const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; // [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n. @@ -264,11 +290,15 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[2] + (uint64_t)a[8] * b[1] + (uint64_t)a[9] * b[0]; + // VERIFY_BITS(d, 64); // [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] uint32_t t9 = d & M; d >>= 26; + VERIFY_BITS(t9, 26); + VERIFY_BITS(d, 38); // [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] c = (uint64_t)a[0] * b[0]; + VERIFY_BITS(c, 60); // [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0] d += (uint64_t)a[1] * b[9] + (uint64_t)a[2] * b[8] @@ -279,15 +309,22 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[3] + (uint64_t)a[8] * b[2] + (uint64_t)a[9] * b[1]; + VERIFY_BITS(d, 63); // [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint64_t u0 = d & M; d >>= 26; c += u0 * R0; + VERIFY_BITS(u0, 26); + VERIFY_BITS(d, 37); + VERIFY_BITS(c, 61); // [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint32_t t0 = c & M; c >>= 26; c += u0 * R1; + VERIFY_BITS(t0, 26); + VERIFY_BITS(c, 37); // [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0] c += (uint64_t)a[0] * b[1] + (uint64_t)a[1] * b[0]; + VERIFY_BITS(c, 62); // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0] d += (uint64_t)a[2] * b[9] + (uint64_t)a[3] * b[8] @@ -297,16 +334,23 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[4] + (uint64_t)a[8] * b[3] + (uint64_t)a[9] * b[2]; + VERIFY_BITS(d, 63); // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint64_t u1 = d & M; d >>= 26; c += u1 * R0; + VERIFY_BITS(u1, 26); + VERIFY_BITS(d, 37); + VERIFY_BITS(c, 63); // [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint32_t t1 = c & M; c >>= 26; c += u1 * R1; + VERIFY_BITS(t1, 26); + VERIFY_BITS(c, 38); // [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] c += (uint64_t)a[0] * b[2] + (uint64_t)a[1] * b[1] + (uint64_t)a[2] * b[0]; + VERIFY_BITS(c, 62); // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] d += (uint64_t)a[3] * b[9] + (uint64_t)a[4] * b[8] @@ -315,10 +359,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[5] + (uint64_t)a[8] * b[4] + (uint64_t)a[9] * b[3]; + VERIFY_BITS(d, 63); // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint64_t u2 = d & M; d >>= 26; c += u2 * R0; + VERIFY_BITS(u2, 26); + VERIFY_BITS(d, 37); + VERIFY_BITS(c, 63); // [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint32_t t2 = c & M; c >>= 26; c += u2 * R1; + VERIFY_BITS(t2, 26); + VERIFY_BITS(c, 38); // [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] @@ -326,6 +376,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[1] * b[2] + (uint64_t)a[2] * b[1] + (uint64_t)a[3] * b[0]; + VERIFY_BITS(c, 63); // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] d += (uint64_t)a[4] * b[9] + (uint64_t)a[5] * b[8] @@ -333,10 +384,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[7] * b[6] + (uint64_t)a[8] * b[5] + (uint64_t)a[9] * b[4]; + VERIFY_BITS(d, 63); // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint64_t u3 = d & M; d >>= 26; c += u3 * R0; + VERIFY_BITS(u3, 26); + VERIFY_BITS(d, 37); + // VERIFY_BITS(c, 64); // [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint32_t t3 = c & M; c >>= 26; c += u3 * R1; + VERIFY_BITS(t3, 26); + VERIFY_BITS(c, 39); // [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] @@ -345,16 +402,23 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[2] * b[2] + (uint64_t)a[3] * b[1] + (uint64_t)a[4] * b[0]; + VERIFY_BITS(c, 63); // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] d += (uint64_t)a[5] * b[9] + (uint64_t)a[6] * b[8] + (uint64_t)a[7] * b[7] + (uint64_t)a[8] * b[6] + (uint64_t)a[9] * b[5]; + VERIFY_BITS(d, 62); // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint64_t u4 = d & M; d >>= 26; c += u4 * R0; + VERIFY_BITS(u4, 26); + VERIFY_BITS(d, 36); + // VERIFY_BITS(c, 64); // [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint32_t t4 = c & M; c >>= 26; c += u4 * R1; + VERIFY_BITS(t4, 26); + VERIFY_BITS(c, 39); // [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] @@ -364,15 +428,22 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[3] * b[2] + (uint64_t)a[4] * b[1] + (uint64_t)a[5] * b[0]; + VERIFY_BITS(c, 63); // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[6] * b[9] + (uint64_t)a[7] * b[8] + (uint64_t)a[8] * b[7] + (uint64_t)a[9] * b[6]; + VERIFY_BITS(d, 62); // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint64_t u5 = d & M; d >>= 26; c += u5 * R0; + VERIFY_BITS(u5, 26); + VERIFY_BITS(d, 36); + // VERIFY_BITS(c, 64); // [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint32_t t5 = c & M; c >>= 26; c += u5 * R1; + VERIFY_BITS(t5, 26); + VERIFY_BITS(c, 39); // [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] @@ -383,14 +454,21 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[4] * b[2] + (uint64_t)a[5] * b[1] + (uint64_t)a[6] * b[0]; + VERIFY_BITS(c, 63); // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[7] * b[9] + (uint64_t)a[8] * b[8] + (uint64_t)a[9] * b[7]; + VERIFY_BITS(d, 61); // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint64_t u6 = d & M; d >>= 26; c += u6 * R0; + VERIFY_BITS(u6, 26); + VERIFY_BITS(d, 35); + // VERIFY_BITS(c, 64); // [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint32_t t6 = c & M; c >>= 26; c += u6 * R1; + VERIFY_BITS(t6, 26); + VERIFY_BITS(c, 39); // [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] @@ -402,13 +480,22 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[5] * b[2] + (uint64_t)a[6] * b[1] + (uint64_t)a[7] * b[0]; + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x8000007C00000007ULL); // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[8] * b[9] + (uint64_t)a[9] * b[8]; + VERIFY_BITS(d, 58); // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u7 = d & M; d >>= 26; c += u7 * R0; + VERIFY_BITS(u7, 26); + VERIFY_BITS(d, 32); + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL); // [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint32_t t7 = c & M; c >>= 26; c += u7 * R1; + VERIFY_BITS(t7, 26); + VERIFY_BITS(c, 38); // [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] @@ -421,49 +508,86 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin + (uint64_t)a[6] * b[2] + (uint64_t)a[7] * b[1] + (uint64_t)a[8] * b[0]; + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x9000007B80000008ULL); // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[9] * b[9]; + VERIFY_BITS(d, 57); // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u8 = d & M; d >>= 26; c += u8 * R0; + VERIFY_BITS(u8, 26); + VERIFY_BITS(d, 31); + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x9000016FBFFFC2F8ULL); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = t3; + VERIFY_BITS(r[3], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = t4; + VERIFY_BITS(r[4], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[5] = t5; + VERIFY_BITS(r[5], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[6] = t6; + VERIFY_BITS(r[6], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[7] = t7; + VERIFY_BITS(r[7], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[8] = c & M; c >>= 26; c += u8 * R1; + VERIFY_BITS(r[8], 26); + VERIFY_BITS(c, 39); // [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R0 + t9; + VERIFY_BITS(c, 45); // [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); + VERIFY_BITS(r[9], 22); + VERIFY_BITS(c, 46); // [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d = c * (R0 >> 4) + t0; + VERIFY_BITS(d, 56); // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[0] = d & M; d >>= 26; + VERIFY_BITS(r[0], 26); + VERIFY_BITS(d, 30); // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += c * (R1 >> 4) + t1; + VERIFY_BITS(d, 53); + VERIFY_CHECK(d <= 0x10000003FFFFBFULL); // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[1] = d & M; d >>= 26; + VERIFY_BITS(r[1], 26); + VERIFY_BITS(d, 27); + VERIFY_CHECK(d <= 0x4000000ULL); // [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += t2; + VERIFY_BITS(d, 27); // [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = d; + VERIFY_BITS(r[2], 27); // [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) { + VERIFY_BITS(a[0], 30); + VERIFY_BITS(a[1], 30); + VERIFY_BITS(a[2], 30); + VERIFY_BITS(a[3], 30); + VERIFY_BITS(a[4], 30); + VERIFY_BITS(a[5], 30); + VERIFY_BITS(a[6], 30); + VERIFY_BITS(a[7], 30); + VERIFY_BITS(a[8], 30); + VERIFY_BITS(a[9], 26); const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; // [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n. @@ -477,88 +601,133 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t + (uint64_t)(a[2]*2) * a[7] + (uint64_t)(a[3]*2) * a[6] + (uint64_t)(a[4]*2) * a[5]; + // VERIFY_BITS(d, 64); // [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] uint32_t t9 = d & M; d >>= 26; + VERIFY_BITS(t9, 26); + VERIFY_BITS(d, 38); // [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] c = (uint64_t)a[0] * a[0]; + VERIFY_BITS(c, 60); // [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0] d += (uint64_t)(a[1]*2) * a[9] + (uint64_t)(a[2]*2) * a[8] + (uint64_t)(a[3]*2) * a[7] + (uint64_t)(a[4]*2) * a[6] + (uint64_t)a[5] * a[5]; + VERIFY_BITS(d, 63); // [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint64_t u0 = d & M; d >>= 26; c += u0 * R0; + VERIFY_BITS(u0, 26); + VERIFY_BITS(d, 37); + VERIFY_BITS(c, 61); // [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] uint32_t t0 = c & M; c >>= 26; c += u0 * R1; + VERIFY_BITS(t0, 26); + VERIFY_BITS(c, 37); // [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0] c += (uint64_t)(a[0]*2) * a[1]; + VERIFY_BITS(c, 62); // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0] d += (uint64_t)(a[2]*2) * a[9] + (uint64_t)(a[3]*2) * a[8] + (uint64_t)(a[4]*2) * a[7] + (uint64_t)(a[5]*2) * a[6]; + VERIFY_BITS(d, 63); // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint64_t u1 = d & M; d >>= 26; c += u1 * R0; + VERIFY_BITS(u1, 26); + VERIFY_BITS(d, 37); + VERIFY_BITS(c, 63); // [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] uint32_t t1 = c & M; c >>= 26; c += u1 * R1; + VERIFY_BITS(t1, 26); + VERIFY_BITS(c, 38); // [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] c += (uint64_t)(a[0]*2) * a[2] + (uint64_t)a[1] * a[1]; + VERIFY_BITS(c, 62); // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] d += (uint64_t)(a[3]*2) * a[9] + (uint64_t)(a[4]*2) * a[8] + (uint64_t)(a[5]*2) * a[7] + (uint64_t)a[6] * a[6]; + VERIFY_BITS(d, 63); // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint64_t u2 = d & M; d >>= 26; c += u2 * R0; + VERIFY_BITS(u2, 26); + VERIFY_BITS(d, 37); + VERIFY_BITS(c, 63); // [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] uint32_t t2 = c & M; c >>= 26; c += u2 * R1; + VERIFY_BITS(t2, 26); + VERIFY_BITS(c, 38); // [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[3] + (uint64_t)(a[1]*2) * a[2]; + VERIFY_BITS(c, 63); // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] d += (uint64_t)(a[4]*2) * a[9] + (uint64_t)(a[5]*2) * a[8] + (uint64_t)(a[6]*2) * a[7]; + VERIFY_BITS(d, 63); // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint64_t u3 = d & M; d >>= 26; c += u3 * R0; + VERIFY_BITS(u3, 26); + VERIFY_BITS(d, 37); + // VERIFY_BITS(c, 64); // [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] uint32_t t3 = c & M; c >>= 26; c += u3 * R1; + VERIFY_BITS(t3, 26); + VERIFY_BITS(c, 39); // [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[4] + (uint64_t)(a[1]*2) * a[3] + (uint64_t)a[2] * a[2]; + VERIFY_BITS(c, 63); // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] d += (uint64_t)(a[5]*2) * a[9] + (uint64_t)(a[6]*2) * a[8] + (uint64_t)a[7] * a[7]; + VERIFY_BITS(d, 62); // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint64_t u4 = d & M; d >>= 26; c += u4 * R0; + VERIFY_BITS(u4, 26); + VERIFY_BITS(d, 36); + // VERIFY_BITS(c, 64); // [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] uint32_t t4 = c & M; c >>= 26; c += u4 * R1; + VERIFY_BITS(t4, 26); + VERIFY_BITS(c, 39); // [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] c += (uint64_t)(a[0]*2) * a[5] + (uint64_t)(a[1]*2) * a[4] + (uint64_t)(a[2]*2) * a[3]; + VERIFY_BITS(c, 63); // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] d += (uint64_t)(a[6]*2) * a[9] + (uint64_t)(a[7]*2) * a[8]; + VERIFY_BITS(d, 62); // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint64_t u5 = d & M; d >>= 26; c += u5 * R0; + VERIFY_BITS(u5, 26); + VERIFY_BITS(d, 36); + // VERIFY_BITS(c, 64); // [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] uint32_t t5 = c & M; c >>= 26; c += u5 * R1; + VERIFY_BITS(t5, 26); + VERIFY_BITS(c, 39); // [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] @@ -566,13 +735,20 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t + (uint64_t)(a[1]*2) * a[5] + (uint64_t)(a[2]*2) * a[4] + (uint64_t)a[3] * a[3]; + VERIFY_BITS(c, 63); // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)(a[7]*2) * a[9] + (uint64_t)a[8] * a[8]; + VERIFY_BITS(d, 61); // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint64_t u6 = d & M; d >>= 26; c += u6 * R0; + VERIFY_BITS(u6, 26); + VERIFY_BITS(d, 35); + // VERIFY_BITS(c, 64); // [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] uint32_t t6 = c & M; c >>= 26; c += u6 * R1; + VERIFY_BITS(t6, 26); + VERIFY_BITS(c, 39); // [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] @@ -580,12 +756,21 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t + (uint64_t)(a[1]*2) * a[6] + (uint64_t)(a[2]*2) * a[5] + (uint64_t)(a[3]*2) * a[4]; + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x8000007C00000007ULL); // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)(a[8]*2) * a[9]; + VERIFY_BITS(d, 58); // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u7 = d & M; d >>= 26; c += u7 * R0; + VERIFY_BITS(u7, 26); + VERIFY_BITS(d, 32); + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL); // [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] uint32_t t7 = c & M; c >>= 26; c += u7 * R1; + VERIFY_BITS(t7, 26); + VERIFY_BITS(c, 38); // [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] @@ -594,45 +779,72 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t + (uint64_t)(a[2]*2) * a[6] + (uint64_t)(a[3]*2) * a[5] + (uint64_t)a[4] * a[4]; + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x9000007B80000008ULL); // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += (uint64_t)a[9] * a[9]; + VERIFY_BITS(d, 57); // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] uint64_t u8 = d & M; d >>= 26; c += u8 * R0; + VERIFY_BITS(u8, 26); + VERIFY_BITS(d, 31); + // VERIFY_BITS(c, 64); + VERIFY_CHECK(c <= 0x9000016FBFFFC2F8ULL); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[3] = t3; + VERIFY_BITS(r[3], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[4] = t4; + VERIFY_BITS(r[4], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[5] = t5; + VERIFY_BITS(r[5], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[6] = t6; + VERIFY_BITS(r[6], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[7] = t7; + VERIFY_BITS(r[7], 26); // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[8] = c & M; c >>= 26; c += u8 * R1; + VERIFY_BITS(r[8], 26); + VERIFY_BITS(c, 39); // [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] c += d * R0 + t9; + VERIFY_BITS(c, 45); // [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); + VERIFY_BITS(r[9], 22); + VERIFY_BITS(c, 46); // [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d = c * (R0 >> 4) + t0; + VERIFY_BITS(d, 56); // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[0] = d & M; d >>= 26; + VERIFY_BITS(r[0], 26); + VERIFY_BITS(d, 30); // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += c * (R1 >> 4) + t1; + VERIFY_BITS(d, 53); + VERIFY_CHECK(d <= 0x10000003FFFFBFULL); // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] // [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[1] = d & M; d >>= 26; + VERIFY_BITS(r[1], 26); + VERIFY_BITS(d, 27); + VERIFY_CHECK(d <= 0x4000000ULL); // [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] d += t2; + VERIFY_BITS(d, 27); // [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] r[2] = d; + VERIFY_BITS(r[2], 27); // [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] }