mratsim · mratsim · Sep 3, 2020 · Sep 2, 2020 · Sep 2, 2020 · Sep 2, 2020
diff --git a/benchmarks/bench_ec_g1.nim b/benchmarks/bench_ec_g1.nim
@@ -53,13 +53,13 @@ proc main() =
     separator()
     scalarMulUnsafeDoubleAddBench(ECP_SWei_Proj[Fp[curve]], MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 2, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], window = 2, MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 3, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], window = 3, MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 4, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], window = 4, MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 5, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], window = 5, MulIters)
     separator()
     scalarMulEndo(ECP_SWei_Proj[Fp[curve]], MulIters)
     separator()

diff --git a/benchmarks/bench_ec_g2.nim b/benchmarks/bench_ec_g2.nim
@@ -54,14 +54,16 @@ proc main() =
     separator()
     scalarMulUnsafeDoubleAddBench(ECP_SWei_Proj[Fp2[curve]], MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], scratchSpaceSize = 1 shl 2, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], window = 2, MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], scratchSpaceSize = 1 shl 3, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], window = 3, MulIters)
     separator()
-    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], scratchSpaceSize = 1 shl 4, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], window = 4, MulIters)
+    separator()
+    scalarMulGenericBench(ECP_SWei_Proj[Fp2[curve]], window = 5, MulIters)
+    separator()
+    scalarMulEndo(ECP_SWei_Proj[Fp2[curve]], MulIters)
     separator()
-    # scalarMulEndo(ECP_SWei_Proj[Fp2[curve]], MulIters)
-    # separator()
   separator()
 
 main()

diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim
@@ -92,9 +92,12 @@ proc notes*() =
   echo "Notes:"
   echo "  - Compilers:"
   echo "    Compilers are severely limited on multiprecision arithmetic."
-  echo "    Inline Assembly is used by default (nimble bench_fp)."
-  echo "    Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
+  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
   echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
+  echo "    GCC also seems to have issues with large temporaries and register spilling."
+  echo "    This is somewhat alleviated by Constantine compile-time assembler."
+  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
+  echo "    Bench on specific compiler without assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
   echo "  - The simplest operations might be optimized away by the compiler."
   echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 
@@ -139,22 +142,18 @@ proc doublingBench*(T: typedesc, iters: int) =
   bench("EC Double " & G1_or_G2, T, iters):
     r.double(P)
 
-proc scalarMulGenericBench*(T: typedesc, scratchSpaceSize: static int, iters: int) =
+proc scalarMulGenericBench*(T: typedesc, window: static int, iters: int) =
   const bits = T.F.C.getCurveOrderBitwidth()
   const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
 
   var r {.noInit.}: T
   let P = rng.random_unsafe(T) # TODO: clear cofactor
 
   let exponent = rng.random_unsafe(BigInt[bits])
-  var exponentCanonical{.noInit.}: array[(bits+7) div 8, byte]
-  exponentCanonical.exportRawUint(exponent, bigEndian)
 
-  var scratchSpace{.noInit.}: array[scratchSpaceSize, T]
-
-  bench("EC ScalarMul Generic " & G1_or_G2 & " (scratchsize = " & $scratchSpaceSize & ')', T, iters):
+  bench("EC ScalarMul Generic " & G1_or_G2 & " (window = " & $window & ", scratchsize = " & $(1 shl window) & ')', T, iters):
     r = P
-    r.scalarMulGeneric(exponentCanonical, scratchSpace)
+    r.scalarMulGeneric(exponent, window)
 
 proc scalarMulEndo*(T: typedesc, iters: int) =
   const bits = T.F.C.getCurveOrderBitwidth()
@@ -167,10 +166,7 @@ proc scalarMulEndo*(T: typedesc, iters: int) =
 
   bench("EC ScalarMul " & G1_or_G2 & " (endomorphism accelerated)", T, iters):
     r = P
-    when T.F is Fp:
-      r.scalarMulGLV(exponent)
-    else:
-      {.error: "Not implemented".}
+    r.scalarMulEndo(exponent)
 
 proc scalarMulEndoWindow*(T: typedesc, iters: int) =
   const bits = T.F.C.getCurveOrderBitwidth()
@@ -196,9 +192,7 @@ proc scalarMulUnsafeDoubleAddBench*(T: typedesc, iters: int) =
   let P = rng.random_unsafe(T) # TODO: clear cofactor
 
   let exponent = rng.random_unsafe(BigInt[bits])
-  var exponentCanonical{.noInit.}: array[(bits+7) div 8, byte]
-  exponentCanonical.exportRawUint(exponent, bigEndian)
 
   bench("EC ScalarMul " & G1_or_G2 & " (unsafe reference DoubleAdd)", T, iters):
     r = P
-    r.unsafe_ECmul_double_add(exponentCanonical)
+    r.unsafe_ECmul_double_add(exponent)
diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim
@@ -89,9 +89,12 @@ proc notes*() =
   echo "Notes:"
   echo "  - Compilers:"
   echo "    Compilers are severely limited on multiprecision arithmetic."
-  echo "    Inline Assembly is used by default (nimble bench_fp)."
-  echo "    Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
+  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
   echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
+  echo "    GCC also seems to have issues with large temporaries and register spilling."
+  echo "    This is somewhat alleviated by Constantine compile-time assembler."
+  echo "    Bench on specific compiler with assembler: \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
+  echo "    Bench on specific compiler without assembler: \"nimble bench_fp_gcc_noasm\" or \"nimble bench_fp_clang_noasm\"."
   echo "  - The simplest operations might be optimized away by the compiler."
   echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 

diff --git a/constantine/arithmetic/bigints.nim b/constantine/arithmetic/bigints.nim
@@ -95,10 +95,12 @@ func cswap*(a, b: var BigInt, ctl: CTBool) =
 func copyTruncatedFrom*[dBits, sBits: static int](dst: var BigInt[dBits], src: BigInt[sBits]) =
   ## Copy `src` into `dst`
   ## if `dst` is not big enough, only the low words are copied
-  ## if `src` is smaller than `dst` the higher words of `dst` will NOT be overwritten
+  ## if `src` is smaller than `dst` the higher words of `dst` will be set to zero
 
   for wordIdx in 0 ..< min(dst.limbs.len, src.limbs.len):
     dst.limbs[wordIdx] = src.limbs[wordIdx]
+  for wordIdx in min(dst.limbs.len, src.limbs.len) ..< dst.limbs.len:
+    dst.limbs[wordIdx] = SecretWord(0)
 
 # Comparison
 # ------------------------------------------------------------
@@ -128,6 +130,23 @@ func isOdd*(a: BigInt): SecretBool =
   ## Returns true if a is odd
   a.limbs.isOdd
 
+func isMsbSet*(a: BigInt): SecretBool =
+  ## Returns true if MSB is set
+  ## i.e. if a BigInt is interpreted
+  ## as signed AND the full bitwidth
+  ## is not used by construction
+  ## This is equivalent to checking
+  ## if the number is negative
+
+  # MSB offset inside the top limb, assuming limbs are full SecretWords (confirm)
+  const msb_pos = (BigInt.bits-1) mod (sizeof(SecretWord) * 8)
+  SecretBool((BaseType(a.limbs[a.limbs.len-1]) shr msb_pos) and 1)
+
+func eq*(a: BigInt, n: SecretWord): SecretBool =
+  ## Returns true if ``a`` is equal to the small word ``n``
+  ## (delegates to the ``eq`` comparison on ``a.limbs``)
+  a.limbs.eq n
+
 # Arithmetic
 # ------------------------------------------------------------
 

diff --git a/constantine/arithmetic/limbs.nim b/constantine/arithmetic/limbs.nim
@@ -131,12 +131,17 @@ func isZero*(a: Limbs): SecretBool =
     accum = accum or a[i]
   result = accum.isZero()
 
-func isOne*(a: Limbs): SecretBool =
-  ## Returns true if ``a`` is equal to one
-  result = a[0] == SecretWord(1)
+func eq*(a: Limbs, n: SecretWord): SecretBool =
+  ## Returns true if the lowest limb of ``a``
+  ## equals ``n`` and every higher limb is zero
+  result = a[0] == n
   for i in 1 ..< a.len:
     result = result and a[i].isZero()
 
+func isOne*(a: Limbs): SecretBool =
+  ## Returns true if ``a`` is equal to one
+  a.eq(SecretWord(1))
+
 func isOdd*(a: Limbs): SecretBool =
   ## Returns true if a is odd
   SecretBool(a[0] and SecretWord(1))

diff --git a/constantine/elliptic/ec_endomorphism_accel.nim b/constantine/elliptic/ec_endomorphism_accel.nim
@@ -15,6 +15,7 @@ import
   ../arithmetic,
   ../io/io_bigints,
   ../towers,
+  ../isogeny/frobenius,
   ./ec_weierstrass_affine,
   ./ec_weierstrass_projective,
   ./ec_endomorphism_params
@@ -192,7 +193,7 @@ func secretLookup[T](dst: var T, table: openArray[T], index: SecretWord) =
     let selector = SecretWord(i) == index
     dst.ccopy(table[i], selector)
 
-func scalarMulGLV*[scalBits](
+func scalarMulEndo*[scalBits](
        P: var ECP_SWei_Proj,
        scalar: BigInt[scalBits]
      ) =
@@ -201,35 +202,51 @@ func scalarMulGLV*[scalBits](
   ##   P <- [k] P
   ##
   ## This is a scalar multiplication accelerated by an endomorphism
-  ## via the GLV (Gallant-lambert-Vanstone) decomposition.
+  ## - via the GLV (Gallant-Lambert-Vanstone) decomposition on G1
+  ## - via the GLS (Galbraith-Lin-Scott) decomposition on G2
+  ##
+  ## Requires:
+  ## - Cofactor to be cleared
+  ## - 0 <= scalar < curve order
   const C = P.F.C # curve
-  static: doAssert: scalBits == C.getCurveOrderBitwidth()
+  static: doAssert scalBits <= C.getCurveOrderBitwidth(), "Do not use endomorphism to multiply beyond the curve order"
   when P.F is Fp:
     const M = 2
-
-  # 1. Compute endomorphisms
-  var endomorphisms {.noInit.}: array[M-1, typeof(P)]
-  endomorphisms[0] = P
-  endomorphisms[0].x *= C.getCubicRootOfUnity_mod_p()
+    # 1. Compute endomorphisms
+    var endomorphisms {.noInit.}: array[M-1, typeof(P)]
+    endomorphisms[0] = P
+    endomorphisms[0].x *= C.getCubicRootOfUnity_mod_p()
+  elif P.F is Fp2:
+    const M = 4
+    # 1. Compute endomorphisms
+    var endomorphisms {.noInit.}: array[M-1, typeof(P)]
+    endomorphisms[0].frobenius_psi(P)
+    endomorphisms[1].frobenius_psi2(P)
+    endomorphisms[2].frobenius_psi(endomorphisms[1])
+  else:
+    {.error: "Unconfigured".}
 
   # 2. Decompose scalar into mini-scalars
-  const L = (C.getCurveOrderBitwidth() + M - 1) div M + 1
+  const L = (scalBits + M - 1) div M + 1 + 1 # A "+1" to handle negative
   var miniScalars {.noInit.}: array[M, BigInt[L]]
-  when C == BN254_Snarks:
-    scalar.decomposeScalar_BN254_Snarks_G1(
-      miniScalars
-    )
-  elif C == BLS12_381:
-    scalar.decomposeScalar_BLS12_381_G1(
-      miniScalars
-    )
-  else:
-    {.error: "Unsupported curve for GLV acceleration".}
+  miniScalars.decomposeEndo(scalar, P.F)
 
-  # 3. TODO: handle negative mini-scalars
-  #    Either negate the associated base and the scalar (in the `endomorphisms` array)
-  #    Or use Algorithm 3 from Faz et al which can encode the sign
-  #    in the GLV representation at the low low price of 1 bit
+  # 3. Handle negative mini-scalars
+  # A scalar decomposition might lead to negative miniscalar.
+  # For proper handling it requires either:
+  # 1. Negating it and then negating the corresponding curve point P
+  # 2. Adding an extra bit to the recoding, which will do the right thing™
+  #
+  # For implementation solution 1 is faster:
+  #   - Double + Add is about 5000~8000 cycles on 6 64-bits limbs (BLS12-381)
+  #   - Conditional negate is about 10 cycles per Fp, on G2 projective we have 3 (coords) * 2 (Fp2) * 10 (cycles) ~= 60 cycles
+  #     We need to test the mini scalar, which is 65 bits so 2 Fp so about 2 cycles
+  #     and negate it as well.
+  #
+  # However solution 1 seems to cause issues (TODO)
+  # with some of the BLS12-381 test cases (6 and 9)
+  # - 0x5668a2332db27199dcfb7cbdfca6317c2ff128db26d7df68483e0a095ec8e88f
+  # - 0x644dc62869683f0c93f38eaef2ba6912569dc91ec2806e46b4a3dd6a4421dad1
 
   # 4. Precompute lookup table
   var lut {.noInit.}: array[1 shl (M-1), ECP_SWei_Proj]
@@ -358,8 +375,8 @@ func w2TableIndex(glv: GLV_SAC, bit2: int, isNeg: var SecretBool): SecretWord {.
 func computeRecodedLength(bitWidth, window: int): int =
   # Strangely in the paper this doesn't depend
   # "m", the GLV decomposition dimension.
-  # lw = ⌈log2 r/w⌉+1
-  let lw = ((bitWidth + window - 1) div window + 1)
+  # lw = ⌈log2 r/w⌉+1+1 (a "+1" to handle negative mini scalars)
+  let lw = (bitWidth + window - 1) div window + 1 + 1
   result = (lw mod window) + lw
 
 func scalarMulGLV_m2w2*[scalBits](
@@ -374,6 +391,10 @@ func scalarMulGLV_m2w2*[scalBits](
   ## via the GLV (Gallant-lambert-Vanstone) decomposition.
   ##
   ## For 2-dimensional decomposition with window 2
+  ##
+  ## Requires:
+  ## - Cofactor to be cleared
+  ## - 0 <= scalar < curve order
   const C = P0.F.C # curve
   static: doAssert: scalBits == C.getCurveOrderBitwidth()
 
@@ -384,16 +405,7 @@ func scalarMulGLV_m2w2*[scalBits](
   # 2. Decompose scalar into mini-scalars
   const L = computeRecodedLength(C.getCurveOrderBitwidth(), 2)
   var miniScalars {.noInit.}: array[2, BigInt[L]]
-  when C == BN254_Snarks:
-    scalar.decomposeScalar_BN254_Snarks_G1(
-      miniScalars
-    )
-  elif C == BLS12_381:
-    scalar.decomposeScalar_BLS12_381_G1(
-      miniScalars
-    )
-  else:
-    {.error: "Unsupported curve for GLV acceleration".}
+  miniScalars.decomposeEndo(scalar, P0.F)
 
   # 3. TODO: handle negative mini-scalars
   #    Either negate the associated base and the scalar (in the `endomorphisms` array)
@@ -553,7 +565,7 @@ when isMainModule:
       )
 
       var decomp: MultiScalar[M, L]
-      decomposeScalar_BN254_Snarks_G1(scalar, decomp)
+      decomp.decomposeEndo(scalar, Fp[BN254_Snarks])
 
       doAssert: bool(decomp[0] == BigInt[L].fromHex"14928105460c820ccc9a25d0d953dbfe")
       doAssert: bool(decomp[1] == BigInt[L].fromHex"13a2f911eb48a578844b901de6f41660")
@@ -564,7 +576,7 @@ when isMainModule:
       )
 
       var decomp: MultiScalar[M, L]
-      decomposeScalar_BN254_Snarks_G1(scalar, decomp)
+      decomp.decomposeEndo(scalar, Fp[BN254_Snarks])
 
       doAssert: bool(decomp[0] == BigInt[L].fromHex"28cf7429c3ff8f7e82fc419e90cc3a2")
       doAssert: bool(decomp[1] == BigInt[L].fromHex"457efc201bdb3d2e6087df36430a6db6")
@@ -575,7 +587,7 @@ when isMainModule:
       )
 
       var decomp: MultiScalar[M, L]
-      decomposeScalar_BN254_Snarks_G1(scalar, decomp)
+      decomp.decomposeEndo(scalar, Fp[BN254_Snarks])
 
       doAssert: bool(decomp[0] == BigInt[L].fromHex"4da8c411566c77e00c902eb542aaa66b")
       doAssert: bool(decomp[1] == BigInt[L].fromHex"5aa8f2f15afc3217f06677702bd4e41a")