Skip to content

Commit 1920adc

Browse files
committed
Reuse stint primitives for limbs
This PR makes bncurve less slow by reusing stint integer primitives and unrolling a few loops and arrays to avoid array length checks and the like. To give an idea, it brings down processing 8k nimbus-eth1 blocks around the 18M block height mark from 24 to 16 minutes - this is quite significant given that a lot of time in eth1 is spent reading the database. This is at least an order of magnitude of bncurve improvement, but probably quite a lot more - how much doesn't greatly matter, but now there's at least a decent baseline for any future performance work ;) Of course, reusing private primitives from `stint` is not pretty - the plan is to extract them to a separate library; work started in status-im/nim-stew#187.
1 parent 9c10dec commit 1920adc

File tree

6 files changed

+39
-104
lines changed

6 files changed

+39
-104
lines changed

bncurve.nimble

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ skipDirs = @["tests", "Nim", "nim"]
88
### Dependencies
99

1010
requires "nim >= 1.6.0",
11-
"nimcrypto"
11+
"nimcrypto",
12+
"stint"
1213

1314
task test, "Run all tests":
1415
for tprog in @[

bncurve/arith.nim

Lines changed: 37 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ import options, endians
1010
import nimcrypto/[utils, sysrand]
1111
export options
1212

13-
{.deadCodeElim: on.}
13+
# TODO replace private stint operations with an integer primitive library
14+
import stint/private/primitives/[addcarry_subborrow, extended_precision]
15+
import stint/private/datatypes
1416

1517
type
1618
BNU256* = array[4, uint64]
@@ -68,16 +70,7 @@ proc getBit*(a: openArray[uint64], n: int): bool {.inline, noinit.} =
6870
let bit = n - (part shl 6)
6971
result = ((a[part] and (1'u64 shl bit)) != 0)
7072

71-
template splitU64(n: uint64, hi, lo: untyped) =
72-
## Split 64bit unsigned integer to 32bit parts
73-
hi = n shr 32
74-
lo = n and 0xFFFF_FFFF'u64
75-
76-
template combineU64(hi, lo: untyped): uint64 =
77-
## Combine 64bit unsigned integer from 32bit parts
78-
(hi shl 32) or lo
79-
80-
proc div2*(a: var BNU256) {.inline.} =
73+
proc div2(a: var BNU256) {.inline.} =
8174
## Divide integer ``a`` in place by ``2``.
8275
var t = a[3] shl 63
8376
a[3] = a[3] shr 1
@@ -90,7 +83,7 @@ proc div2*(a: var BNU256) {.inline.} =
9083
a[0] = a[0] shr 1
9184
a[0] = a[0] or t
9285

93-
proc mul2*(a: var BNU256) {.inline.} =
86+
proc mul2(a: var BNU256) {.inline.} =
9487
## Multiply integer ``a`` in place by ``2``.
9588
var last = 0'u64
9689
for i in a.mitems():
@@ -99,92 +92,42 @@ proc mul2*(a: var BNU256) {.inline.} =
9992
i = i or last
10093
last = tmp
10194

102-
proc adc(a, b: uint64, carry: var uint64): uint64 {.inline, noinit.} =
103-
## Calculate ``a + b`` and return result, set ``carry`` to addition
104-
## operation carry.
105-
var a0, a1, b0, b1, c, r0, r1: uint64
106-
splitU64(a, a1, a0)
107-
splitU64(b, b1, b0)
108-
let tmp0 = a0 + b0 + carry
109-
splitU64(tmp0, c, r0)
110-
let tmp1 = a1 + b1 + c
111-
splitU64(tmp1, c, r1)
112-
carry = c
113-
result = combineU64(r1, r0)
114-
115-
proc addNoCarry*(a: var BNU256, b: BNU256) {.inline.} =
95+
proc addNoCarry(a: var BNU256, b: BNU256) {.inline.} =
11696
## Calculate integer addition ``a = a + b``.
117-
var carry = 0'u64
118-
a[0] = adc(a[0], b[0], carry)
119-
a[1] = adc(a[1], b[1], carry)
120-
a[2] = adc(a[2], b[2], carry)
121-
a[3] = adc(a[3], b[3], carry)
122-
doAssert(carry == 0)
97+
var carry: Carry
98+
staticFor i, 0, 4:
99+
addC(carry, a[i], a[i], b[i], carry)
123100

124-
proc subNoBorrow*(a: var BNU256, b: BNU256) {.inline.} =
101+
proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} =
125102
## Calculate integer substraction ``a = a - b``.
126-
proc sbb(a: uint64, b: uint64,
127-
borrow: var uint64): uint64 {.inline, noinit.}=
128-
var a0, a1, b0, b1, t0, r0, r1: uint64
129-
splitU64(a, a1, a0)
130-
splitU64(b, b1, b0)
131-
let tmp0 = (1'u64 shl 32) + a0 - b0 - borrow
132-
splitU64(tmp0, t0, r0)
133-
let tmp1 = (1'u64 shl 32) + a1 - b1 - uint64(t0 == 0'u64)
134-
splitU64(tmp1, t0, r1)
135-
borrow = uint64(t0 == 0)
136-
result = combineU64(r1, r0)
137-
var borrow = 0'u64
138-
a[0] = sbb(a[0], b[0], borrow)
139-
a[1] = sbb(a[1], b[1], borrow)
140-
a[2] = sbb(a[2], b[2], borrow)
141-
a[3] = sbb(a[3], b[3], borrow)
142-
doAssert(borrow == 0)
143-
144-
proc macDigit(acc: var openArray[uint64], pos: int, b: openArray[uint64],
145-
c: uint64) =
146-
proc macWithCarry(a, b, c: uint64, carry: var uint64): uint64 {.noinit.} =
147-
var
148-
bhi, blo, chi, clo, ahi, alo, carryhi, carrylo: uint64
149-
xhi, xlo, yhi, ylo, zhi, zlo, rhi, rlo: uint64
150-
splitU64(b, bhi, blo)
151-
splitU64(c, chi, clo)
152-
splitU64(a, ahi, alo)
153-
splitU64(carry, carryhi, carrylo)
154-
splitU64(blo * clo + alo + carrylo, xhi, xlo)
155-
splitU64(blo * chi, yhi, ylo)
156-
splitU64(bhi * clo, zhi, zlo)
157-
splitU64(xhi + ylo + zlo + ahi + carryhi, rhi, rlo)
158-
carry = (bhi * chi) + rhi + yhi + zhi
159-
result = combineU64(rlo, xlo)
103+
var borrow: Borrow
104+
staticFor i, 0, 4:
105+
subB(borrow, a[i], a[i], b[i], borrow)
160106

107+
proc macDigit[N, N2: static int](
108+
acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) =
161109
if c == 0'u64:
162110
return
111+
163112
var carry = 0'u64
164-
for i in pos..<len(acc):
165-
if (i - pos) < len(b):
166-
acc[i] = macWithCarry(acc[i], b[i - pos], c, carry)
167-
elif carry != 0:
168-
acc[i] = macWithCarry(acc[i], 0'u64, c, carry)
113+
114+
staticFor i, pos, N:
115+
when (i - pos) < len(b):
116+
muladd2(carry, acc[i], b[i-pos], c, acc[i], carry)
169117
else:
170-
break
171-
doAssert(carry == 0)
118+
muladd2(carry, acc[i], 0, c, acc[i], carry)
172119

173-
proc mulReduce(a: var BNU256, by: BNU256, modulus: BNU256,
174-
inv: uint64) =
120+
proc mulReduce(a: var BNU256, by: BNU256, modulus: BNU256, inv: uint64) =
175121
var res: array[4 * 2, uint64]
176-
var k: uint64
177-
macDigit(res, 0, by, a[0])
178-
macDigit(res, 1, by, a[1])
179-
macDigit(res, 2, by, a[2])
180-
macDigit(res, 3, by, a[3])
181-
for i in 0..<4:
182-
k = inv * res[i]
122+
staticFor i, 0, 4:
123+
macDigit(res, i, by, a[i])
124+
125+
staticFor i, 0, 4:
126+
let k = inv * res[i]
183127
macDigit(res, i, modulus, k)
184-
a[0] = res[4]
185-
a[1] = res[5]
186-
a[2] = res[6]
187-
a[3] = res[7]
128+
129+
staticFor i, 0, 4:
130+
a[i] = res[i + 4]
188131

189132
proc compare*(a: BNU256, b: BNU256): int {.noinit, inline.}=
190133
## Compare integers ``a`` and ``b``.
@@ -267,15 +210,14 @@ proc into*(t: typedesc[BNU512], c1: BNU256,
267210
macDigit(result, 1, modulo, c1[1])
268211
macDigit(result, 2, modulo, c1[2])
269212
macDigit(result, 3, modulo, c1[3])
270-
var carry = 0'u64
271-
for i in 0..<len(result):
272-
if len(c0) > i:
273-
result[i] = adc(result[i], c0[i], carry)
274-
elif carry != 0'u64:
275-
result[i] = adc(result[i], 0'u64, carry)
213+
var carry: Carry
214+
staticFor i, 0, len(result):
215+
when len(c0) > i:
216+
addC(carry, result[i], result[i], c0[i], carry)
276217
else:
277-
break
278-
doAssert(carry == 0'u64)
218+
addC(carry, result[i], result[i], 0'u64, carry)
219+
220+
doAssert(carry == 0)
279221

280222
proc fromBytes*(dst: var BNU256, src: openArray[byte]): bool =
281223
## Create 256bit integer from big-endian bytes representation ``src``.

bncurve/fp.nim

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
# those terms.
99
import arith, options
1010

11-
{.deadCodeElim: on.}
12-
1311
template fieldImplementation(finame, fimodulus, firsquared, fircubed,
1412
fionep, fiinv: untyped): untyped {.dirty.} =
1513
type finame* = distinct BNU256

bncurve/fq12.nim

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
import options
1010
import fq6, fq2, fp, arith
1111

12-
{.deadCodeElim: on.}
13-
1412
const frobeniusCoeffsC1: array[4, FQ2] = [
1513
FQ2.one(),
1614
FQ2(

bncurve/fq2.nim

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
import options
1010
import fp, arith
1111

12-
{.deadCodeElim: on.}
13-
1412
type
1513
FQ2* = object
1614
c0*: FQ

bncurve/fq6.nim

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
import options
1010
import fq2, fp, arith
1111

12-
{.deadCodeElim: on.}
13-
1412
const frobeniusCoeffsC1: array[4, FQ2] = [
1513
FQ2.one(),
1614
FQ2(

0 commit comments

Comments
 (0)