Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0aabd6e
Implement double-width field multiplication for double-width towering
mratsim Jul 25, 2020
8ad4c0f
Fp2 mul acceleration via double-width lazy reduction (pure Nim)
mratsim Jul 25, 2020
5e18ecc
Inline assembly for basic add and sub
mratsim Jul 25, 2020
268172d
Use 2 registers instead of 12+ for ASM conditional copy
mratsim Jul 26, 2020
07f9475
Prepare assembly for extended multiprecision multiplication support
mratsim Jul 26, 2020
4b7ba2f
Add assembly for mul
mratsim Jul 27, 2020
eda83de
initial implementation of assembly reduction
mratsim Jul 27, 2020
2195b9c
stash current progress of assembly reduction
mratsim Aug 13, 2020
f7b9943
Fix clobbering issue, only P256 comparison remain buggy
mratsim Aug 16, 2020
fa3d094
Fix asm montgomery reduction for NIST P256 as well
mratsim Aug 16, 2020
90255ca
MULX/ADCX/ADOX multi-precision multiplication
mratsim Aug 17, 2020
403f883
MULX/ADCX/ADOX reduction v1
mratsim Aug 17, 2020
c9d3076
Add (deactivated) assembly for double-width substraction + rework ben…
mratsim Aug 17, 2020
594fcf2
Add bench to nimble and deactivate double-width for now. slower than …
mratsim Aug 18, 2020
9b59606
Fix x86-32 running out of registers for mul
mratsim Aug 18, 2020
eacec57
Clang needs to be at v9 to support flag output constraints (Xcode 11.…
mratsim Aug 18, 2020
4c2a571
32-bit doesn't have enough registers for ASM mul
mratsim Aug 19, 2020
37473c7
Fix again Travis Clang 9 issues
mratsim Aug 19, 2020
4fea06b
LLVM 9 is not whitelisted in travis
mratsim Aug 19, 2020
e19783a
deactivated assembler with travis clang
mratsim Aug 19, 2020
5db4fa4
syntax error
mratsim Aug 19, 2020
1f3cae1
another
mratsim Aug 20, 2020
f078e99
...
mratsim Aug 20, 2020
fd9b60c
missing space, yeah ...
mratsim Aug 20, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ matrix:
# Build and test on both x86-64 and ARM64
# Ubuntu Bionic (18.04) is needed, it includes
# GCC 7 codegen fixes to addcarry_u64.
# Clang 9 (and GCC-6) are needed for inline assembly "flag output constraints"
- dist: bionic
arch: amd64
env:
Expand All @@ -33,9 +34,19 @@ matrix:
- ARCH=amd64
- CHANNEL=devel
compiler: clang
# addons:
# apt:
# sources:
# - ubuntu-toolchain-r-test
# - llvm-toolchain-bionic-9.0 # LLVM 9 repo is disallowed
# packages:
# - clang-9.0
# env:
# - MATRIX_EVAL="CC=clang-9.0 && CXX=clang++-9.0"

# On OSX we only test against clang (gcc is mapped to clang by default)
- os: osx
osx_image: xcode11.5 # Need xcode 11.4.2 min for Clang 9
arch: amd64
env:
- ARCH=amd64
Expand Down Expand Up @@ -98,9 +109,17 @@ before_script:
script:
- nimble refresh
- nimble install gmp stew
- nimble test_parallel
- if [[ "$ARCH" != "arm64" ]]; then
nimble test_parallel_no_assembler;
# Installing Clang9.0 or later is a pain in Travis
# for inline assembly "flag output constraint"
# Also MacOS build is timing out with 2 series of tests.
- |
if [[ "$TRAVIS_COMPILER" == "clang" ]]; then
nimble test_parallel_no_assembler
else
nimble test_parallel
if [[ "$ARCH" != "arm64" ]]; then
nimble test_parallel_no_assembler
fi
fi
branches:
except:
Expand Down
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ strategy:
# TEST_LANG: c

MacOS_devel_64bit:
VM: 'macOS-10.14'
VM: 'macOS-10.15'
UCPU: amd64
CHANNEL: devel
TEST_LANG: c
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bench_elliptic_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseX86ASM
echo " inline assembly: ", UseASM_X86_64

when (sizeof(int) == 4) or defined(Constantine32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bench_fields_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseX86ASM
echo " inline assembly: ", UseASM_X86_64

when (sizeof(int) == 4) or defined(Constantine32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
Expand Down
196 changes: 196 additions & 0 deletions benchmarks/bench_fp_double_width.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
# Benchmark of finite fields
#
# ############################################################

import
# Internals
../constantine/config/[curves, common],
../constantine/arithmetic,
../constantine/towers,
# Helpers
../helpers/[prng_unsafe, static_for],
./platforms,
# Standard library
std/[monotimes, times, strformat, strutils, macros]

# Seed the (non-cryptographic) benchmark RNG from the current unix time,
# truncated to 32 bits, and print it so a run can be reproduced exactly.
var rng: RngState
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
rng.seed(seed)
echo "bench xoshiro512** seed: ", seed

# warmup
proc warmup*() =
  ## Busy-loop long enough for the CPU to ramp up to its maximum
  ## frequency before any timing measurement is taken.
  ## The accumulated result is printed so the compiler cannot
  ## optimize the loop away.
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789

  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"

warmup()

# Report the toolchain and build configuration so the benchmark numbers
# can be interpreted: compiler, optimization flags, inline-assembly usage,
# and limb width. UseASM_X86_64, SupportsCPUName and SupportsGetTicks are
# project-level compile-time constants (from config/platforms) — the
# branches below are resolved at compile time.
when defined(gcc):
  echo "\nCompiled with GCC"
elif defined(clang):
  echo "\nCompiled with Clang"
elif defined(vcc):
  echo "\nCompiled with MSVC"
elif defined(icc):
  echo "\nCompiled with ICC"
else:
  echo "\nCompiled with an unknown compiler"

echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseASM_X86_64

when (sizeof(int) == 4) or defined(Constantine32):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
  echo "Using Constantine with 64-bit limbs"

when SupportsCPUName:
  echo "Running on ", cpuName(), ""

when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"

echo "\n=================================================================================================================\n"

proc separator*() =
  ## Print a horizontal rule between benchmark groups.
  echo repeat('-', 145)

proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  ## Print one formatted benchmark result line: operation name, field
  ## description, throughput (ops/s), nanoseconds per operation and,
  ## when the platform exposes a cycle counter, approximate CPU cycles
  ## per operation. startClk/stopClk are raw tick readings around the
  ## whole `iters` loop.
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"

proc notes*() =
  ## Print the caveats that apply to all benchmark numbers above
  ## (compiler quality, dead-code elimination, prime-shape fast paths).
  echo "Notes:"
  echo "  - Compilers:"
  echo "    Compilers are severely limited on multiprecision arithmetic."
  echo "    Inline Assembly is used by default (nimble bench_fp)."
  echo "    Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo "  - The simplest operations might be optimized away by the compiler."
  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"

template bench(op: string, desc: string, iters: int, body: untyped): untyped =
  ## Time `body` over `iters` iterations and forward the measurements
  ## to `report`. Cycle counters are only read when the platform
  ## supports them (SupportsGetTicks is a compile-time constant);
  ## otherwise sentinel values of -1 are reported.
  let start = getMonotime()
  when SupportsGetTicks:
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stop = getMonotime()

  # Sentinels keep report()'s signature uniform on platforms without ticks.
  when not SupportsGetTicks:
    let startClk = -1'i64
    let stopClk = -1'i64

  report(op, desc, start, stop, startClk, stopClk, iters)

func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
  ## Initialize a standalone Double-Width field element
  ## we don't reduce it modulo p², this is only used for benchmark
  # Draw two single-width elements and splice their limbs into the
  # low then high halves of the double-width limb array.
  # NOTE(review): assumes a.limbs2x.len == 2 * Base's limb count —
  # confirm against the FpDbl definition.
  let aHi = rng.random_unsafe(Base)
  let aLo = rng.random_unsafe(Base)
  for i in 0 ..< aLo.mres.limbs.len:
    a.limbs2x[i] = aLo.mres.limbs[i]
  for i in 0 ..< aHi.mres.limbs.len:
    a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]

proc sumNoReduce(T: typedesc, iters: int) =
  ## Benchmark field addition without the final modular reduction.
  # Operands drawn in the same order as elsewhere to keep the RNG stream stable.
  let x = rng.random_unsafe(T)
  let y = rng.random_unsafe(T)
  var acc: T
  bench("Addition no reduce", $T, iters):
    acc.sumNoReduce(x, y)

proc sum(T: typedesc, iters: int) =
  ## Benchmark full modular field addition.
  let x = rng.random_unsafe(T)
  let y = rng.random_unsafe(T)
  var acc: T
  bench("Addition", $T, iters):
    acc.sum(x, y)

proc diffNoReduce(T: typedesc, iters: int) =
  ## Benchmark field subtraction without the final modular reduction.
  var r: T
  let a = rng.random_unsafe(T)
  let b = rng.random_unsafe(T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction no reduce", $T, iters):
    r.diffNoReduce(a, b)

proc diff(T: typedesc, iters: int) =
  ## Benchmark full modular field subtraction.
  var r: T
  let a = rng.random_unsafe(T)
  let b = rng.random_unsafe(T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction", $T, iters):
    r.diff(a, b)

proc diff2xNoReduce(T: typedesc, iters: int) =
  ## Benchmark double-width subtraction without reduction.
  var r, a, b: doubleWidth(T)
  # r is fully overwritten by diffNoReduce below; its random init is
  # redundant but kept so the RNG stream (and thus a, b) is unchanged.
  rng.random_unsafe(r, T)
  rng.random_unsafe(a, T)
  rng.random_unsafe(b, T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction 2x no reduce", $doubleWidth(T), iters):
    r.diffNoReduce(a, b)

proc diff2x(T: typedesc, iters: int) =
  ## Benchmark double-width subtraction with reduction.
  var r, a, b: doubleWidth(T)
  # r is fully overwritten by diff below; its random init is
  # redundant but kept so the RNG stream (and thus a, b) is unchanged.
  rng.random_unsafe(r, T)
  rng.random_unsafe(a, T)
  rng.random_unsafe(b, T)
  # Fixed typo in the reported label: "Substraction" -> "Subtraction"
  bench("Subtraction 2x", $doubleWidth(T), iters):
    r.diff(a, b)

proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
  ## Benchmark multi-precision multiplication: rLen-bit result
  ## from aLen-bit times bLen-bit operands.
  var product: BigInt[rLen]
  let lhs = rng.random_unsafe(BigInt[aLen])
  let rhs = rng.random_unsafe(BigInt[bLen])
  let label = $rLen & " <- " & $aLen & " x " & $bLen
  bench("Multiplication", label, iters):
    product.prod(lhs, rhs)

proc reduce2x*(T: typedesc, iters: int) =
  ## Benchmark reduction of a double-width element back to single width.
  var narrow: T
  var wide: doubleWidth(T)
  rng.random_unsafe(wide, T)

  let label = $T & " <- " & $doubleWidth(T)
  bench("Reduce 2x-width", label, iters):
    narrow.reduce(wide)

proc main() =
  ## Run the full double-width field benchmark suite over BLS12-381.
  const Iters = 10_000_000  # shared iteration count for every benchmark
  separator()
  sumNoReduce(Fp[BLS12_381], iters = Iters)
  diffNoReduce(Fp[BLS12_381], iters = Iters)
  sum(Fp[BLS12_381], iters = Iters)
  diff(Fp[BLS12_381], iters = Iters)
  diff2x(Fp[BLS12_381], iters = Iters)
  diff2xNoReduce(Fp[BLS12_381], iters = Iters)
  mul2xBench(768, 384, 384, iters = Iters)
  reduce2x(Fp[BLS12_381], iters = Iters)
  separator()

main()
notes()
59 changes: 57 additions & 2 deletions constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_finite_fields_vs_gmp.nim", true),
# Precompute
("tests/t_precomputed", false),
# Double-width finite fields
("tests/t_finite_fields_double_width.nim", false),
# Towers of extension fields
("tests/t_fp2.nim", false),
("tests/t_fp2_sqrt.nim", false),
Expand Down Expand Up @@ -100,13 +102,15 @@ proc test(flags, path: string, commandFile = false) =
# commandFile.writeLine command
exec "echo \'" & command & "\' >> " & buildParallel

proc runBench(benchName: string, compiler = "") =
proc runBench(benchName: string, compiler = "", useAsm = true) =
if not dirExists "build":
mkDir "build"

var cc = ""
if compiler != "":
cc = "--cc:" & compiler & " -d:ConstantineASM=false"
cc = "--cc:" & compiler
if not useAsm:
cc &= " -d:ConstantineASM=false"
exec "nim c " & cc &
" -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler &
" -r --hints:off --warnings:off benchmarks/" & benchName & ".nim"
Expand Down Expand Up @@ -298,6 +302,27 @@ task bench_fp_gcc, "Run benchmark 𝔽p with gcc":
task bench_fp_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp", "clang")

task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp", "gcc", useAsm = false)

task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp", "clang", useAsm = false)

task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
runBench("bench_fp_double_width")

task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
runBench("bench_fp_double_width", "gcc")

task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp_double_width", "clang")

task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp_double_width", "gcc", useAsm = false)

task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp_double_width", "clang", useAsm = false)

task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
runBench("bench_fp2")

Expand All @@ -307,6 +332,12 @@ task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc":
task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
runBench("bench_fp2", "clang")

task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly":
runBench("bench_fp2", "gcc", useAsm = false)

task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly":
runBench("bench_fp2", "clang", useAsm = false)

task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
runBench("bench_fp6")

Expand All @@ -316,6 +347,12 @@ task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc":
task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
runBench("bench_fp6", "clang")

task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly":
runBench("bench_fp6", "gcc", useAsm = false)

task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly":
runBench("bench_fp6", "clang", useAsm = false)

task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
runBench("bench_fp12")

Expand All @@ -325,6 +362,12 @@ task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
runBench("bench_fp12", "clang")

task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly":
runBench("bench_fp12", "gcc", useAsm = false)

task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
runBench("bench_fp12", "clang", useAsm = false)

task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
runBench("bench_ec_g1")

Expand All @@ -334,6 +377,12 @@ task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weier
task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
runBench("bench_ec_g1", "clang")

task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
runBench("bench_ec_g1", "gcc", useAsm = false)

task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
runBench("bench_ec_g1", "clang", useAsm = false)

task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC":
runBench("bench_ec_g2")

Expand All @@ -342,3 +391,9 @@ task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weier

task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang":
runBench("bench_ec_g2", "clang")

task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
runBench("bench_ec_g2", "gcc", useAsm = false)

task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
runBench("bench_ec_g2", "clang", useAsm = false)
Loading