Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0aabd6e
Implement double-width field multiplication for double-width towering
mratsim Jul 25, 2020
8ad4c0f
Fp2 mul acceleration via double-width lazy reduction (pure Nim)
mratsim Jul 25, 2020
5e18ecc
Inline assembly for basic add and sub
mratsim Jul 25, 2020
268172d
Use 2 registers instead of 12+ for ASM conditional copy
mratsim Jul 26, 2020
07f9475
Prepare assembly for extended multiprecision multiplication support
mratsim Jul 26, 2020
4b7ba2f
Add assembly for mul
mratsim Jul 27, 2020
eda83de
initial implementation of assembly reduction
mratsim Jul 27, 2020
2195b9c
stash current progress of assembly reduction
mratsim Aug 13, 2020
f7b9943
Fix clobbering issue, only P256 comparison remain buggy
mratsim Aug 16, 2020
fa3d094
Fix asm montgomery reduction for NIST P256 as well
mratsim Aug 16, 2020
90255ca
MULX/ADCX/ADOX multi-precision multiplication
mratsim Aug 17, 2020
403f883
MULX/ADCX/ADOX reduction v1
mratsim Aug 17, 2020
c9d3076
Add (deactivated) assembly for double-width substraction + rework ben…
mratsim Aug 17, 2020
594fcf2
Add bench to nimble and deactivate double-width for now. slower than …
mratsim Aug 18, 2020
9b59606
Fix x86-32 running out of registers for mul
mratsim Aug 18, 2020
eacec57
Clang needs to be at v9 to support flag output constraints (Xcode 11.…
mratsim Aug 18, 2020
4c2a571
32-bit doesn't have enough registers for ASM mul
mratsim Aug 19, 2020
37473c7
Fix again Travis Clang 9 issues
mratsim Aug 19, 2020
4fea06b
LLVM 9 is not whitelisted in travis
mratsim Aug 19, 2020
e19783a
deactivated assembler with travis clang
mratsim Aug 19, 2020
5db4fa4
syntax error
mratsim Aug 19, 2020
1f3cae1
another
mratsim Aug 20, 2020
f078e99
...
mratsim Aug 20, 2020
fd9b60c
missing space, yeah ...
mratsim Aug 20, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ matrix:
# Build and test on both x86-64 and ARM64
# Ubuntu Bionic (18.04) is needed, it includes
# GCC 7 codegen fixes to addcarry_u64.
# Clang 9 (and GCC-6) are needed for inline assembly "flag output constraints"
- dist: bionic
arch: amd64
env:
Expand All @@ -33,9 +34,19 @@ matrix:
- ARCH=amd64
- CHANNEL=devel
compiler: clang
# addons:
# apt:
# sources:
# - ubuntu-toolchain-r-test
# - llvm-toolchain-bionic-9.0 # LLVM 9 repo is disallowed
# packages:
# - clang-9.0
# env:
# - MATRIX_EVAL="CC=clang-9.0 && CXX=clang++-9.0"

# On OSX we only test against clang (gcc is mapped to clang by default)
- os: osx
osx_image: xcode11.5 # Need xcode 11.4.2 min for Clang 9
arch: amd64
env:
- ARCH=amd64
Expand Down Expand Up @@ -98,9 +109,17 @@ before_script:
script:
- nimble refresh
- nimble install gmp stew
- nimble test_parallel
- if [[ "$ARCH" != "arm64" ]]; then
nimble test_parallel_no_assembler;
# Installing Clang9.0 or later is a pain in Travis
# for inline assembly "flag output constraint"
# Also MacOS build is timing out with 2 series of tests.
- |
if [[ "$TRAVIS_COMPILER" == "clang" ]]; then
nimble test_parallel_no_assembler
else
nimble test_parallel
if [[ "$ARCH" != "arm64" ]]; then
nimble test_parallel_no_assembler
fi
fi
branches:
except:
Expand Down
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ strategy:
# TEST_LANG: c

MacOS_devel_64bit:
VM: 'macOS-10.14'
VM: 'macOS-10.15'
UCPU: amd64
CHANNEL: devel
TEST_LANG: c
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bench_elliptic_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseX86ASM
echo " inline assembly: ", UseASM_X86_64

when (sizeof(int) == 4) or defined(Constantine32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bench_fields_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseX86ASM
echo " inline assembly: ", UseASM_X86_64

when (sizeof(int) == 4) or defined(Constantine32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
Expand Down
196 changes: 196 additions & 0 deletions benchmarks/bench_fp_double_width.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
# Benchmark of finite fields
#
# ############################################################

import
# Internals
../constantine/config/[curves, common],
../constantine/arithmetic,
../constantine/towers,
# Helpers
../helpers/[prng_unsafe, static_for],
./platforms,
# Standard library
std/[monotimes, times, strformat, strutils, macros]

# Seed the (non-cryptographic) benchmark RNG from the current unix time,
# truncated to 32 bits, and print it so a run can be reproduced exactly.
var rng: RngState
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
rng.seed(seed)
echo "bench xoshiro512** seed: ", seed

# warmup
proc warmup*() =
  ## Busy-loop long enough for the CPU to ramp up to its maximum
  ## frequency before any timing measurement is taken.
  ## The accumulated result is printed so the compiler cannot
  ## optimize the loop away.
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789

  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"

warmup()

# Report the toolchain and build configuration so the benchmark numbers
# can be interpreted: compiler, optimization flags, inline-assembly usage,
# and limb width. UseASM_X86_64, SupportsCPUName and SupportsGetTicks are
# project-level compile-time constants (from config/platforms) — the
# branches below are resolved at compile time.
when defined(gcc):
  echo "\nCompiled with GCC"
elif defined(clang):
  echo "\nCompiled with Clang"
elif defined(vcc):
  echo "\nCompiled with MSVC"
elif defined(icc):
  echo "\nCompiled with ICC"
else:
  echo "\nCompiled with an unknown compiler"

echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseASM_X86_64

when (sizeof(int) == 4) or defined(Constantine32):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
  echo "Using Constantine with 64-bit limbs"

when SupportsCPUName:
  echo "Running on ", cpuName(), ""

when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"

echo "\n=================================================================================================================\n"

proc separator*() =
  ## Print a horizontal rule between benchmark groups.
  echo repeat('-', 145)

proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  ## Print one formatted benchmark result line: operation name, field
  ## description, throughput (ops/s), nanoseconds per operation and,
  ## when the platform exposes a cycle counter, approximate CPU cycles
  ## per operation. startClk/stopClk are raw tick readings around the
  ## whole `iters` loop.
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"

proc notes*() =
  ## Print the caveats that apply to all benchmark numbers above
  ## (compiler quality, dead-code elimination, prime-shape fast paths).
  echo "Notes:"
  echo "  - Compilers:"
  echo "    Compilers are severely limited on multiprecision arithmetic."
  echo "    Inline Assembly is used by default (nimble bench_fp)."
  echo "    Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo "  - The simplest operations might be optimized away by the compiler."
  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"

template bench(op: string, desc: string, iters: int, body: untyped): untyped =
  ## Time `body` over `iters` iterations and forward the measurements
  ## to `report`. Cycle counters are only read when the platform
  ## supports them (SupportsGetTicks is a compile-time constant);
  ## otherwise sentinel values of -1 are reported.
  let start = getMonotime()
  when SupportsGetTicks:
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stop = getMonotime()

  # Sentinels keep report()'s signature uniform on platforms without ticks.
  when not SupportsGetTicks:
    let startClk = -1'i64
    let stopClk = -1'i64

  report(op, desc, start, stop, startClk, stopClk, iters)

func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
  ## Initialize a standalone Double-Width field element
  ## we don't reduce it modulo p², this is only used for benchmark
  # Draw two single-width elements and splice their limbs into the
  # low then high halves of the double-width limb array.
  # NOTE(review): assumes a.limbs2x.len == 2 * Base's limb count —
  # confirm against the FpDbl definition.
  let aHi = rng.random_unsafe(Base)
  let aLo = rng.random_unsafe(Base)
  for i in 0 ..< aLo.mres.limbs.len:
    a.limbs2x[i] = aLo.mres.limbs[i]
  for i in 0 ..< aHi.mres.limbs.len:
    a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]

proc sumNoReduce(T: typedesc, iters: int) =
  ## Benchmark field addition without the final modular reduction.
  # Operands drawn in the same order as elsewhere to keep the RNG stream stable.
  let x = rng.random_unsafe(T)
  let y = rng.random_unsafe(T)
  var acc: T
  bench("Addition no reduce", $T, iters):
    acc.sumNoReduce(x, y)

proc sum(T: typedesc, iters: int) =
  ## Benchmark full modular field addition.
  let x = rng.random_unsafe(T)
  let y = rng.random_unsafe(T)
  var acc: T
  bench("Addition", $T, iters):
    acc.sum(x, y)

proc diffNoReduce(T: typedesc, iters: int) =
  ## Benchmark field subtraction without the final modular reduction.
  var r: T
  let a = rng.random_unsafe(T)
  let b = rng.random_unsafe(T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction no reduce", $T, iters):
    r.diffNoReduce(a, b)

proc diff(T: typedesc, iters: int) =
  ## Benchmark full modular field subtraction.
  var r: T
  let a = rng.random_unsafe(T)
  let b = rng.random_unsafe(T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction", $T, iters):
    r.diff(a, b)

proc diff2xNoReduce(T: typedesc, iters: int) =
  ## Benchmark double-width subtraction without reduction.
  var r, a, b: doubleWidth(T)
  # r is fully overwritten by diffNoReduce below; its random init is
  # redundant but kept so the RNG stream (and thus a, b) is unchanged.
  rng.random_unsafe(r, T)
  rng.random_unsafe(a, T)
  rng.random_unsafe(b, T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction 2x no reduce", $doubleWidth(T), iters):
    r.diffNoReduce(a, b)

proc diff2x(T: typedesc, iters: int) =
  ## Benchmark double-width subtraction with reduction.
  var r, a, b: doubleWidth(T)
  # r is fully overwritten by diff below; its random init is
  # redundant but kept so the RNG stream (and thus a, b) is unchanged.
  rng.random_unsafe(r, T)
  rng.random_unsafe(a, T)
  rng.random_unsafe(b, T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction 2x", $doubleWidth(T), iters):
    r.diff(a, b)

proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
  ## Benchmark multi-precision multiplication: rLen-bit result
  ## from aLen-bit times bLen-bit operands.
  var product: BigInt[rLen]
  let lhs = rng.random_unsafe(BigInt[aLen])
  let rhs = rng.random_unsafe(BigInt[bLen])
  let label = $rLen & " <- " & $aLen & " x " & $bLen
  bench("Multiplication", label, iters):
    product.prod(lhs, rhs)

proc reduce2x*(T: typedesc, iters: int) =
  ## Benchmark reduction of a double-width element back to single width.
  var narrow: T
  var wide: doubleWidth(T)
  rng.random_unsafe(wide, T)

  let label = $T & " <- " & $doubleWidth(T)
  bench("Reduce 2x-width", label, iters):
    narrow.reduce(wide)

proc main() =
  ## Run the full double-width field benchmark suite over BLS12-381.
  const Iters = 10_000_000  # shared iteration count for every benchmark
  separator()
  sumNoReduce(Fp[BLS12_381], iters = Iters)
  diffNoReduce(Fp[BLS12_381], iters = Iters)
  sum(Fp[BLS12_381], iters = Iters)
  diff(Fp[BLS12_381], iters = Iters)
  diff2x(Fp[BLS12_381], iters = Iters)
  diff2xNoReduce(Fp[BLS12_381], iters = Iters)
  mul2xBench(768, 384, 384, iters = Iters)
  reduce2x(Fp[BLS12_381], iters = Iters)
  separator()

main()
notes()
59 changes: 57 additions & 2 deletions constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_finite_fields_vs_gmp.nim", true),
# Precompute
("tests/t_precomputed", false),
# Double-width finite fields
("tests/t_finite_fields_double_width.nim", false),
# Towers of extension fields
("tests/t_fp2.nim", false),
("tests/t_fp2_sqrt.nim", false),
Expand Down Expand Up @@ -100,13 +102,15 @@ proc test(flags, path: string, commandFile = false) =
# commandFile.writeLine command
exec "echo \'" & command & "\' >> " & buildParallel

proc runBench(benchName: string, compiler = "") =
proc runBench(benchName: string, compiler = "", useAsm = true) =
if not dirExists "build":
mkDir "build"

var cc = ""
if compiler != "":
cc = "--cc:" & compiler & " -d:ConstantineASM=false"
cc = "--cc:" & compiler
if not useAsm:
cc &= " -d:ConstantineASM=false"
exec "nim c " & cc &
" -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler &
" -r --hints:off --warnings:off benchmarks/" & benchName & ".nim"
Expand Down Expand Up @@ -298,6 +302,27 @@ task bench_fp_gcc, "Run benchmark 𝔽p with gcc":
task bench_fp_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp", "clang")

task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp", "gcc", useAsm = false)

task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp", "clang", useAsm = false)

task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
runBench("bench_fp_double_width")

task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
runBench("bench_fp_double_width", "gcc")

task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp_double_width", "clang")

task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp_double_width", "gcc", useAsm = false)

task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp_double_width", "clang", useAsm = false)

task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
runBench("bench_fp2")

Expand All @@ -307,6 +332,12 @@ task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc":
task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
runBench("bench_fp2", "clang")

task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly":
runBench("bench_fp2", "gcc", useAsm = false)

task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly":
runBench("bench_fp2", "clang", useAsm = false)

task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
runBench("bench_fp6")

Expand All @@ -316,6 +347,12 @@ task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc":
task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
runBench("bench_fp6", "clang")

task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly":
runBench("bench_fp6", "gcc", useAsm = false)

task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly":
runBench("bench_fp6", "clang", useAsm = false)

task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
runBench("bench_fp12")

Expand All @@ -325,6 +362,12 @@ task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
runBench("bench_fp12", "clang")

task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly":
runBench("bench_fp12", "gcc", useAsm = false)

task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
runBench("bench_fp12", "clang", useAsm = false)

task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
runBench("bench_ec_g1")

Expand All @@ -334,6 +377,12 @@ task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weier
task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
runBench("bench_ec_g1", "clang")

task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
runBench("bench_ec_g1", "gcc", useAsm = false)

task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
runBench("bench_ec_g1", "clang", useAsm = false)

task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC":
runBench("bench_ec_g2")

Expand All @@ -342,3 +391,9 @@ task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weier

task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang":
runBench("bench_ec_g2", "clang")

task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
runBench("bench_ec_g2", "gcc", useAsm = false)

task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
runBench("bench_ec_g2", "clang", useAsm = false)
Loading