Skip to content

Commit 0a9224f

Browse files
authored
Nim/solution_3: tune stride8 techniques and alignment (#747)
1 parent 54bf211 commit 0a9224f

File tree

2 files changed

+45
-34
lines changed

2 files changed

+45
-34
lines changed

PrimeNim/solution_3/Primes.nim

+37-26
Original file line numberDiff line numberDiff line change
@@ -177,27 +177,31 @@ macro denseSetBits(bitbufa, bytelen, ndx0, step: untyped) = # ndx0 must be var
177177
type BitSeq = ref object
178178
size: Natural
179179
buffer: seq[byte]
180+
bufferp: ptr UncheckedArray[byte]
180181

181-
func newBitSeq(size: int): BitSeq = # round up to even 64-bit size
182-
BitSeq(size: size, buffer: newSeq[byte](((size - 1 + 64) shr 3) and (-8)))
182+
func newBitSeq(size: int): BitSeq =
183+
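# buffer length in bytes: size rounded up to a whole multiple of 256 bits, plus 256 spare bits so the data pointer can be aligned up to a 32-byte boundary...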
184+
let sq = newSeq[byte](((size - 1 + 512) shr 3) and (-32))
185+
let sqpi = (cast[int](sq[0].unsafeAddr) + 31) and (-32)
186+
BitSeq(size: size, buffer: sq, bufferp: cast[ptr UncheckedArray[byte]](sqpi))
183187

184188
func `[]`(bitseq: BitSeq; i: int): bool {.inline.} =
185-
(bitseq.buffer[i shr 3] and BITMASK[i and 7]) != 0'u8
189+
(bitseq.bufferp[i shr 3] and BITMASK[i and 7]) != 0'u8
186190

187191
func `[]`(bitseq: BitSeq; startstop: HSlice[int, int];
188192
step: int = 1): iterator: bool {.closure.} {.inline.} =
189193
assert step <= 0 or startstop.b < startstop.a,
190194
"Error: illegal slice limits or step size!!"
191195
return iterator: bool {.closure.} =
192196
for i in countup(startstop.a, startstop.b, step):
193-
yield (bitseq.buffer[i shr 3] and BITMASK[i and 7]) != 0
197+
yield (bitseq.bufferp[i shr 3] and BITMASK[i and 7]) != 0
194198

195199
# sets a range of the BitSeq by step size to true (the func itself returns no value)...
196200
func setRange(bitseq: BitSeq; start, stop: int; step: int = 1,
197201
hybrid: bool = false) =
198202
assert step <= 0 or stop < start or stop > bitseq.size,
199203
"Error: illegal slice limits or step size!!!"
200-
let bitbufa = cast[int](bitseq.buffer[0].addr)
204+
let bitbufa = cast[int](bitseq.bufferp)
201205
var ndx = start
202206
let sz = min(bitseq.buffer.len, (stop + 8) shr 3) # round up
203207
if start <= sz * 8 - 16 * step: # enough loops to be worth the setup
@@ -209,7 +213,7 @@ func setRange(bitseq: BitSeq; start, stop: int; step: int = 1,
209213
ndx += step
210214

211215
func countTruesTo(bitseq: BitSeq; index: int): int =
212-
let bsp = cast[ptr UncheckedArray[uint64]](bitseq.buffer[0].addr)
216+
let bsp = cast[ptr UncheckedArray[uint64]](bitseq.bufferp)
213217
let lstwrd = index shr 6
214218
let mask = not ((0'u64 - 2'u64) shl (index and 63))
215219
result = 0
@@ -229,8 +233,7 @@ func newPrimeSieve(lmt: Prime; tec: Techniques): PrimeSieve =
229233
result = PrimeSieve(limit: lmt, # BitSeq size rounded up to nearest uint64...
230234
sievebuffer: newBitSeq((bitlmt + 64) and (-63)))
231235

232-
let cmpstsBytesp =
233-
cast[ptr UncheckedArray[byte]](result.sievebuffer.buffer[0].addr)
236+
let cmpstsBytesp = result.sieveBuffer.bufferp
234237
let starts = newSeq[int](8)
235238
let startsp = cast[ptr UncheckedArray[int]](starts[0].unsafeAddr)
236239

@@ -242,37 +245,45 @@ func newPrimeSieve(lmt: Prime; tec: Techniques): PrimeSieve =
242245

243246
case tec:
244247
of bittwiddle:
245-
let cmpstsBytesp =
246-
cast[ptr UncheckedArray[byte]](result.sievebuffer.buffer[0].addr)
247248
while cullIndex <= bitlmt:
248249
let byteIndex = cullIndex shr 3
249250
cmpstsBytesp[byteIndex] =
250251
cmpstsBytesp[byteIndex] or BITMASK[cullIndex and 7]
251252
cullIndex += basePrime
252253

253254
of stride8:
254-
let slmt = min(bitlmt, cullIndex + (basePrime shl 3) - 1)
255-
let byteLimit = bitlmt shr 3
255+
let bufa = cast[int](cmpstsBytesp)
256+
let bytealmt = bufa + (bitlmt shr 3)
257+
let slmt = min(bitlmt, cullIndex + (basePrime shl 3) - 1)
256258
while cullIndex <= slmt:
257-
let mask = BITMASK[cullIndex and 7]; var byteIndex = cullIndex shr 3
258-
while byteIndex <= byteLimit:
259-
let cp = cast[ptr byte](cmpstsBytesp[byteIndex].unsafeAddr)
260-
cp[] = cp[] or mask; byteIndex += basePrime
259+
let mask = BITMASK[cullIndex and 7]
260+
var bytepa = bufa + (cullIndex shr 3)
261+
while bytepa <= bytealmt:
262+
let cp = cast[ptr byte](bytepa)
263+
cp[] = cp[] or mask; bytepa += basePrime
261264
cullIndex += basePrime
262265

263266
of stride8block:
264-
let startIndex = cullIndex shr 3; let bytelimit = bitlmt shr 3
267+
let bufa = cast[int](cmpstsBytesp); let bufalmt = bufa + (bitlmt shr 3)
268+
let starta = bufa + ((cullIndex shr 3) and (-CPUL1CACHE))
269+
let bp2 = basePrime + basePrime; let bp3 = basePrime + bp2
270+
let bp4 = basePrime + bp3
265271
for _ in 0 .. 7:
266-
startsp[cullIndex and 7] = cullIndex shr 3; cullIndex += basePrime
267-
for pageIndex in countup(startIndex and (-CPUL1CACHE),
268-
byteLimit, CPUL1CACHE):
269-
let pageLimit = min(byteLimit, pageIndex + CPUL1CACHE - 1)
272+
startsp[cullIndex and 7] = bufa + (cullIndex shr 3)
273+
cullIndex += basePrime
274+
for pagea in countup(starta, bufalmt, CPUL1CACHE):
275+
let pageLimit = min(bufalmt, pagea + CPUL1CACHE - 1)
270276
for si in 0 .. 7:
271-
let mask = BITMASK[si]; var byteIndex = startsp[si]
272-
while byteIndex <= pageLimit:
273-
let cp = cast[ptr byte](cmpstsBytesp[byteIndex].unsafeAddr)
274-
cp[] = cp[] or mask; byteIndex += basePrime
275-
startsp[si] = byteIndex
277+
let mask = BITMASK[si]; var bytea = startsp[si]
278+
while bytea <= pageLimit - bp3:
279+
let cp = cast[ptr UncheckedArray[byte]](bytea)
280+
cp[0] = cp[0] or mask; cp[basePrime] = cp[basePrime] or mask
281+
cp[bp2] = cp[bp2] or mask; cp[bp3] = cp[bp3] or mask
282+
bytea += bp4
283+
while bytea <= pageLimit:
284+
let cp = cast[ptr byte](bytea)
285+
cp[] = cp[] or mask; bytea += basePrime
286+
startsp[si] = bytea
276287

277288
of extreme:
278289
result.sieveBuffer.setRange(cullIndex, bitlmt, step = basePrime)

PrimeNim/solution_3/README.md

+8-8
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ Running locally on my Intel SkyLake i5-6500 at 3.6 GHz when single threaded, I g
5353
```
5454
Passes: 8549, Time: 5.00003205, Avg: 0.0005848674757281553, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
5555
GordonBGood_bittwiddle;8549;5.00003205;1;algorithm=base,faithful=yes,bits=1
56-
Passes: 10963, Time: 5.000370107, Avg: 0.0004561132999179057, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
57-
GordonBGood_stride8;10963;5.000370107;1;algorithm=base,faithful=yes,bits=1
58-
Passes: 11630, Time: 5.00007037, Avg: 0.0004299286646603612, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
59-
GordonBGood_stride8block-16K;11630;5.00007037;1;algorithm=base,faithful=yes,bits=1
56+
Passes: 12179, Time: 5.000142803, Avg: 0.0004105544628458823, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
57+
GordonBGood_stride8;12179;5.000142803;1;algorithm=base,faithful=yes,bits=1
58+
Passes: 15481, Time: 5.000130019, Avg: 0.0003229849505199923, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
59+
GordonBGood_stride8block-16K;15481;5.000130019;1;algorithm=base,faithful=yes,bits=1
6060
Passes: 18332, Time: 5.000006381, Avg: 0.0002727474569605062, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
6161
GordonBGood_extreme;18332;5.000006381;1;algorithm=base,faithful=yes,bits=1
6262
Passes: 44094, Time: 5.000062015, Avg: 0.0001133955190048533, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
@@ -70,10 +70,10 @@ Which matches the results when run with Docker on the same machine as follows:
7070
│ Index │ Implementation │ Solution │ Label │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
7171
├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
7272
│ 1 │ nim │ 3 │ GordonBGood_extreme-hybrid │ 43730 │ 5.00014 │ 1 │ base │ yes │ 1 │ 8745.75303 │
73-
│ 2 │ nim │ 3 │ GordonBGood_extreme │ 18327 │ 5.00002 │ 1 │ base │ yes │ 1 │ 3665.38708
74-
│ 3 │ nim │ 3 │ GordonBGood_stride8block-16K │ 11571 │ 5.00013 │ 1 │ base │ yes │ 1 │ 2314.13979
75-
│ 4 │ nim │ 3 │ GordonBGood_stride8 │ 11010 │ 5.00045 │ 1 │ base │ yes │ 1 │ 2201.80349
76-
│ 5 │ nim │ 3 │ GordonBGood_bittwiddle │ 8641 │ 5.00048 │ 1 │ base │ yes │ 1 │ 1728.03354
73+
│ 2 │ nim │ 3 │ GordonBGood_extreme │ 18115 │ 5.00016 │ 1 │ base │ yes │ 1 │ 3622.88491
74+
│ 3 │ nim │ 3 │ GordonBGood_stride8block-16K │ 15389 │ 5.00014 │ 1 │ base │ yes │ 1 │ 3077.71376
75+
│ 4 │ nim │ 3 │ GordonBGood_stride8 │ 12196 │ 5.00025 │ 1 │ base │ yes │ 1 │ 2439.07929
76+
│ 5 │ nim │ 3 │ GordonBGood_bittwiddle │ 8480 │ 5.00022 │ 1 │ base │ yes │ 1 │ 1695.92701
7777
└───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
7878
```
7979

0 commit comments

Comments
 (0)