Nim/solution_3: tune stride8 techniques and alignment (#747)

GordonBGood · web-flow · commit 0a9224f577da · 2021-10-09T17:00:18.000+02:00
diff --git a/PrimeNim/solution_3/Primes.nim b/PrimeNim/solution_3/Primes.nim
@@ -177,27 +177,31 @@ macro denseSetBits(bitbufa, bytelen, ndx0, step: untyped) = # ndx0 must be var
 type BitSeq = ref object
   size: Natural
   buffer: seq[byte]
+  bufferp: ptr UncheckedArray[byte]
 
-func newBitSeq(size: int): BitSeq = # round up to even 64-bit size
-  BitSeq(size: size, buffer: newSeq[byte](((size - 1 + 64) shr 3) and (-8)))
+func newBitSeq(size: int): BitSeq =
+  # bytes for size rounded up to 256 bits, plus 32 spare bytes to align bufferp...
+  let sq = newSeq[byte](((size - 1 + 512) shr 3) and (-32))
+  let sqpi = (cast[int](sq[0].unsafeAddr) + 31) and (-32)
+  BitSeq(size: size, buffer: sq, bufferp: cast[ptr UncheckedArray[byte]](sqpi))
 
 func `[]`(bitseq: BitSeq; i: int): bool {.inline.} =
-  (bitseq.buffer[i shr 3] and BITMASK[i and 7]) != 0'u8
+  (bitseq.bufferp[i shr 3] and BITMASK[i and 7]) != 0'u8
 
 func `[]`(bitseq: BitSeq; startstop: HSlice[int, int];
           step: int = 1): iterator: bool {.closure.} {.inline.} =
   assert step <= 0 or startstop.b < startstop.a,
          "Error:  illegal slice limits or step size!!"
   return iterator: bool {.closure.} =
     for i in countup(startstop.a, startstop.b, step):
-      yield (bitseq.buffer[i shr 3] and BITMASK[i and 7]) != 0
+      yield (bitseq.bufferp[i shr 3] and BITMASK[i and 7]) != 0
 
 # sets a range of the BitSeq by step size to true and returns the next index...
 func setRange(bitseq: BitSeq; start, stop: int; step: int = 1,
               hybrid: bool = false) =
   assert step <= 0 or stop < start or stop > bitseq.size,
          "Error:  illegal slice limits or step size!!!"
-  let bitbufa = cast[int](bitseq.buffer[0].addr)
+  let bitbufa = cast[int](bitseq.bufferp)
   var ndx = start
   let sz = min(bitseq.buffer.len, (stop + 8) shr 3) # round up
   if start <= sz * 8 - 16 * step: # enough loops to be worth the setup
@@ -209,7 +213,7 @@ func setRange(bitseq: BitSeq; start, stop: int; step: int = 1,
     ndx += step
 
 func countTruesTo(bitseq: BitSeq; index: int): int =
-  let bsp = cast[ptr UncheckedArray[uint64]](bitseq.buffer[0].addr)
+  let bsp = cast[ptr UncheckedArray[uint64]](bitseq.bufferp)
   let lstwrd = index shr 6
   let mask = not ((0'u64 - 2'u64) shl (index and 63))
   result = 0
@@ -229,8 +233,7 @@ func newPrimeSieve(lmt: Prime; tec: Techniques): PrimeSieve =
   result = PrimeSieve(limit: lmt, # BitSeq size rounded up to nearest uint64...
                       sievebuffer: newBitSeq((bitlmt + 64) and (-63)))
 
-  let cmpstsBytesp =
-        cast[ptr UncheckedArray[byte]](result.sievebuffer.buffer[0].addr)
+  let cmpstsBytesp = result.sieveBuffer.bufferp
   let starts = newSeq[int](8)
   let startsp = cast[ptr UncheckedArray[int]](starts[0].unsafeAddr)
   
@@ -242,37 +245,45 @@ func newPrimeSieve(lmt: Prime; tec: Techniques): PrimeSieve =
  
     case tec:
       of bittwiddle:
-        let cmpstsBytesp =
-              cast[ptr UncheckedArray[byte]](result.sievebuffer.buffer[0].addr)
         while cullIndex <= bitlmt:
           let byteIndex = cullIndex shr 3
           cmpstsBytesp[byteIndex] =
             cmpstsBytesp[byteIndex] or BITMASK[cullIndex and 7]
           cullIndex += basePrime
       
       of stride8:
-        let slmt = min(bitlmt, cullIndex + (basePrime shl 3) - 1)
-        let byteLimit = bitlmt shr 3
+        let bufa = cast[int](cmpstsBytesp)
+        let bytealmt = bufa + (bitlmt shr 3)
+        let slmt = min(bitlmt, cullIndex + (basePrime shl 3) - 1)       
         while cullIndex <= slmt:
-          let mask = BITMASK[cullIndex and 7]; var byteIndex = cullIndex shr 3
-          while byteIndex <= byteLimit:
-            let cp = cast[ptr byte](cmpstsBytesp[byteIndex].unsafeAddr)
-            cp[] = cp[] or mask; byteIndex += basePrime
+          let mask = BITMASK[cullIndex and 7]
+          var bytepa = bufa + (cullIndex shr 3)
+          while bytepa <= bytealmt:
+            let cp = cast[ptr byte](bytepa)
+            cp[] = cp[] or mask; bytepa += basePrime
           cullIndex += basePrime
       
       of stride8block:
-        let startIndex = cullIndex shr 3; let bytelimit = bitlmt shr 3
+        let bufa = cast[int](cmpstsBytesp); let bufalmt = bufa + (bitlmt shr 3)
+        let starta = bufa + ((cullIndex shr 3) and (-CPUL1CACHE))
+        let bp2 = basePrime + basePrime; let bp3 = basePrime + bp2
+        let bp4 = basePrime + bp3
         for _ in 0 .. 7:
-          startsp[cullIndex and 7] = cullIndex shr 3; cullIndex += basePrime
-        for pageIndex in countup(startIndex and (-CPUL1CACHE),
-                                    byteLimit, CPUL1CACHE):
-          let pageLimit = min(byteLimit, pageIndex + CPUL1CACHE - 1)
+          startsp[cullIndex and 7] = bufa + (cullIndex shr 3)
+          cullIndex += basePrime
+        for pagea in countup(starta, bufalmt, CPUL1CACHE):
+          let pageLimit = min(bufalmt, pagea + CPUL1CACHE - 1)
           for si in 0 .. 7:
-            let mask = BITMASK[si]; var byteIndex = startsp[si]
-            while byteIndex <= pageLimit:
-              let cp = cast[ptr byte](cmpstsBytesp[byteIndex].unsafeAddr)
-              cp[] = cp[] or mask; byteIndex += basePrime
-            startsp[si] = byteIndex
+            let mask = BITMASK[si]; var bytea = startsp[si]
+            while bytea <= pageLimit - bp3:
+              let cp = cast[ptr UncheckedArray[byte]](bytea)
+              cp[0] = cp[0] or mask; cp[basePrime] = cp[basePrime] or mask
+              cp[bp2] = cp[bp2] or mask; cp[bp3] = cp[bp3] or mask
+              bytea += bp4
+            while bytea <= pageLimit:
+              let cp = cast[ptr byte](bytea)
+              cp[] = cp[] or mask; bytea += basePrime
+            startsp[si] = bytea
       
       of extreme:
         result.sieveBuffer.setRange(cullIndex, bitlmt, step = basePrime)
diff --git a/PrimeNim/solution_3/README.md b/PrimeNim/solution_3/README.md
@@ -53,10 +53,10 @@ Running locally on my Intel SkyLake i5-6500 at 3.6 GHz when single threaded, I g
 ```
 Passes: 8549, Time: 5.00003205, Avg: 0.0005848674757281553, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
 GordonBGood_bittwiddle;8549;5.00003205;1;algorithm=base,faithful=yes,bits=1
-Passes: 10963, Time: 5.000370107, Avg: 0.0004561132999179057, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
-GordonBGood_stride8;10963;5.000370107;1;algorithm=base,faithful=yes,bits=1
-Passes: 11630, Time: 5.00007037, Avg: 0.0004299286646603612, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
-GordonBGood_stride8block-16K;11630;5.00007037;1;algorithm=base,faithful=yes,bits=1
+Passes: 12179, Time: 5.000142803, Avg: 0.0004105544628458823, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
+GordonBGood_stride8;12179;5.000142803;1;algorithm=base,faithful=yes,bits=1
+Passes: 15481, Time: 5.000130019, Avg: 0.0003229849505199923, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
+GordonBGood_stride8block-16K;15481;5.000130019;1;algorithm=base,faithful=yes,bits=1
 Passes: 18332, Time: 5.000006381, Avg: 0.0002727474569605062, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
 GordonBGood_extreme;18332;5.000006381;1;algorithm=base,faithful=yes,bits=1
 Passes: 44094, Time: 5.000062015, Avg: 0.0001133955190048533, Limit: 1000000, Count1: 78498, Count2: 78498, Valid: true
@@ -70,10 +70,10 @@ Which matches the results when run with Docker on the same machine as follows:
 │ Index │ Implementation │ Solution │ Label                        │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
 ├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
 │   1   │ nim            │ 3        │ GordonBGood_extreme-hybrid   │ 43730  │ 5.00014  │    1    │   base    │   yes    │ 1    │  8745.75303   │
-│   2   │ nim            │ 3        │ GordonBGood_extreme          │ 18327  │ 5.00002  │    1    │   base    │   yes    │ 1    │  3665.38708   │
-│   3   │ nim            │ 3        │ GordonBGood_stride8block-16K │ 11571  │ 5.00013  │    1    │   base    │   yes    │ 1    │  2314.13979   │
-│   4   │ nim            │ 3        │ GordonBGood_stride8          │ 11010  │ 5.00045  │    1    │   base    │   yes    │ 1    │  2201.80349   │
-│   5   │ nim            │ 3        │ GordonBGood_bittwiddle       │  8641  │ 5.00048  │    1    │   base    │   yes    │ 1    │  1728.03354   │
+│   2   │ nim            │ 3        │ GordonBGood_extreme          │ 18115  │ 5.00016  │    1    │   base    │   yes    │ 1    │  3622.88491   │
+│   3   │ nim            │ 3        │ GordonBGood_stride8block-16K │ 15389  │ 5.00014  │    1    │   base    │   yes    │ 1    │  3077.71376   │
+│   4   │ nim            │ 3        │ GordonBGood_stride8          │ 12196  │ 5.00025  │    1    │   base    │   yes    │ 1    │  2439.07929   │
+│   5   │ nim            │ 3        │ GordonBGood_bittwiddle       │  8480  │ 5.00022  │    1    │   base    │   yes    │ 1    │  1695.92701   │
 └───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
 ```