Fixed randu and randn ranges for floating types

Kumar Aatish · umar456 · commit e59eb93c16b0 · 2017-05-22T12:48:59.000-04:00
Randu now generates random values in range [0, 1) for floating types.
Randn avoids Infs by making sure the Box-Muller transform is not given
0 as an input.
diff --git a/src/backend/cpu/kernel/random_engine.hpp b/src/backend/cpu/kernel/random_engine.hpp
@@ -20,10 +20,17 @@ namespace cpu
 namespace kernel
 {
     //Utils
-    static const float UINTMAXFLOAT = 4294967296.0f;
-    static const float UINTLMAXDOUBLE = (4294967296.0*4294967296.0);
     static const double PI_VAL = 3.1415926535897932384626433832795028841971693993751058209749445923078164;
 
+    //Conversion to floats adapted from Random123
+    #define UINTMAX 0xffffffff
+    #define FLT_FACTOR ((1.0f)/(UINTMAX + (1.0f)))
+    #define HALF_FLT_FACTOR ((0.5f)*FLT_FACTOR)
+
+    #define UINTLMAX 0xffffffffffffffff
+    #define DBL_FACTOR ((1.0)/(UINTLMAX + (1.0)))
+    #define HALF_DBL_FACTOR ((0.5)*DBL_FACTOR)
+
     template <typename T>
     T transform(uint *val, int index)
     {
@@ -76,15 +83,17 @@ namespace kernel
         return transform<uintl>(val, index);
     }
 
+    //Generates rationals in [0, 1)
     template <> float transform<float>(uint *val, int index)
     {
-        return (float)val[index]/UINTMAXFLOAT;
+        return 1.f - (val[index]*FLT_FACTOR + HALF_FLT_FACTOR);
     }
 
+    //Generates rationals in [0, 1)
     template <> double transform<double>(uint *val, int index)
     {
         uintl v = transform<uintl>(val, index);
-        return (double)v/UINTLMAXDOUBLE;
+        return 1.0 - (v*DBL_FACTOR + HALF_DBL_FACTOR);
     }
 
     template <typename T>
@@ -131,8 +140,8 @@ namespace kernel
         /*
          * The log of a real value x where 0 < x < 1 is negative.
          */
-        T r = sqrt((T)(-2.0) * log(r1));
-        T theta = 2 * (T)PI_VAL * (r2);
+        T r = sqrt((T)(-2.0) * log((T)(1.0) - r1));
+        T theta = 2 * (T)PI_VAL * ((T)(1.0) - r2);
         *out1 = r*sin(theta);
         *out2 = r*cos(theta);
     }
diff --git a/src/backend/cuda/kernel/random_engine.hpp b/src/backend/cuda/kernel/random_engine.hpp
@@ -25,19 +25,28 @@ namespace kernel
     //Utils
 
     static const int THREADS = 256;
-    #define UINTMAXFLOAT 4294967296.0f
-    #define UINTLMAXDOUBLE (4294967296.0*4294967296.0)
     #define PI_VAL 3.1415926535897932384626433832795028841971693993751058209749445923078164
 
+    //Conversion to floats adapted from Random123
+    #define UINTMAX 0xffffffff
+    #define FLT_FACTOR ((1.0f)/(UINTMAX + (1.0f)))
+    #define HALF_FLT_FACTOR ((0.5f)*FLT_FACTOR)
+
+    #define UINTLMAX 0xffffffffffffffff
+    #define DBL_FACTOR ((1.0)/(UINTLMAX + (1.0)))
+    #define HALF_DBL_FACTOR ((0.5)*DBL_FACTOR)
+
+    //Generates rationals in (0, 1]
     __device__ static float getFloat(const uint &num)
     {
-        return float(num)/UINTMAXFLOAT;
+        return (num*FLT_FACTOR + HALF_FLT_FACTOR);
     }
 
+    //Generates rationals in (0, 1]
     __device__ static double getDouble(const uint &num1, const uint &num2)
     {
         uintl num = (((uintl)num1)<<32) | ((uintl)num2);
-        return double(num)/UINTLMAXDOUBLE;
+        return (num*DBL_FACTOR + HALF_DBL_FACTOR);
     }
 
     template <typename T>
@@ -150,33 +159,33 @@ namespace kernel
     __device__ static void writeOut128Bytes(float *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4)
     {
-        out[index]                = getFloat(r1);
-        out[index +   blockDim.x] = getFloat(r2);
-        out[index + 2*blockDim.x] = getFloat(r3);
-        out[index + 3*blockDim.x] = getFloat(r4);
+        out[index]                = 1.f - getFloat(r1);
+        out[index +   blockDim.x] = 1.f - getFloat(r2);
+        out[index + 2*blockDim.x] = 1.f - getFloat(r3);
+        out[index + 3*blockDim.x] = 1.f - getFloat(r4);
     }
 
     __device__ static void writeOut128Bytes(cfloat *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4)
     {
-        out[index].x              = getFloat(r1);
-        out[index].y              = getFloat(r2);
-        out[index + blockDim.x].x = getFloat(r3);
-        out[index + blockDim.x].y = getFloat(r4);
+        out[index].x              = 1.f - getFloat(r1);
+        out[index].y              = 1.f - getFloat(r2);
+        out[index + blockDim.x].x = 1.f - getFloat(r3);
+        out[index + blockDim.x].y = 1.f - getFloat(r4);
     }
 
     __device__ static void writeOut128Bytes(double *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4)
     {
-        out[index]              = getDouble(r1, r2);
-        out[index + blockDim.x] = getDouble(r3, r4);
+        out[index]              = 1.0 - getDouble(r1, r2);
+        out[index + blockDim.x] = 1.0 - getDouble(r3, r4);
     }
 
     __device__ static void writeOut128Bytes(cdouble *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4)
     {
-        out[index].x = getDouble(r1, r2);
-        out[index].y = getDouble(r3, r4);
+        out[index].x = 1.0 - getDouble(r1, r2);
+        out[index].y = 1.0 - getDouble(r3, r4);
     }
 
     //Normalized writes without boundary checking
@@ -305,38 +314,38 @@ namespace kernel
     __device__ static void partialWriteOut128Bytes(float *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)
     {
-        if (index                < elements) {out[index]                = getFloat(r1);}
-        if (index +   blockDim.x < elements) {out[index +   blockDim.x] = getFloat(r2);}
-        if (index + 2*blockDim.x < elements) {out[index + 2*blockDim.x] = getFloat(r3);}
-        if (index + 3*blockDim.x < elements) {out[index + 3*blockDim.x] = getFloat(r4);}
+        if (index                < elements) {out[index]                = 1.f - getFloat(r1);}
+        if (index +   blockDim.x < elements) {out[index +   blockDim.x] = 1.f - getFloat(r2);}
+        if (index + 2*blockDim.x < elements) {out[index + 2*blockDim.x] = 1.f - getFloat(r3);}
+        if (index + 3*blockDim.x < elements) {out[index + 3*blockDim.x] = 1.f - getFloat(r4);}
     }
 
     __device__ static void partialWriteOut128Bytes(cfloat *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)
     {
         if (index              < elements) {
-            out[index].x              = getFloat(r1);
-            out[index].y              = getFloat(r2);
+            out[index].x              = 1.f - getFloat(r1);
+            out[index].y              = 1.f - getFloat(r2);
         }
         if (index + blockDim.x < elements) {
-            out[index + blockDim.x].x = getFloat(r3);
-            out[index + blockDim.x].y = getFloat(r4);
+            out[index + blockDim.x].x = 1.f - getFloat(r3);
+            out[index + blockDim.x].y = 1.f - getFloat(r4);
         }
     }
 
     __device__ static void partialWriteOut128Bytes(double *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)
     {
-        if (index              < elements) {out[index]              = getDouble(r1, r2);}
-        if (index + blockDim.x < elements) {out[index + blockDim.x] = getDouble(r3, r4);}
+        if (index              < elements) {out[index]              = 1.0 - getDouble(r1, r2);}
+        if (index + blockDim.x < elements) {out[index + blockDim.x] = 1.0 - getDouble(r3, r4);}
     }
 
     __device__ static void partialWriteOut128Bytes(cdouble *out, const uint &index,
             const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)
     {
         if (index < elements) {
-            out[index].x = getDouble(r1, r2);
-            out[index].y = getDouble(r3, r4);
+            out[index].x = 1.0 - getDouble(r1, r2);
+            out[index].y = 1.0 - getDouble(r3, r4);
         }
     }
 
diff --git a/src/backend/opencl/kernel/random_engine_write.cl b/src/backend/opencl/kernel/random_engine_write.cl
@@ -7,13 +7,17 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#define UINTMAXFLOAT 4294967296.0f
-#define UINTLMAXDOUBLE (4294967296.0*4294967296.0)
 #define PI_VAL 3.1415926535897932384626433832795028841971693993751058209749445923078164
 
+//Conversion to floats adapted from Random123
+#define UINTMAX 0xffffffff
+#define FLT_FACTOR ((1.0f)/(UINTMAX + (1.0f)))
+#define HALF_FLT_FACTOR ((0.5f)*FLT_FACTOR)
+
+//Generates rationals in (0, 1]
 float getFloat(const uint * const num)
 {
-    return ((float)(*num))/UINTMAXFLOAT;
+    return ((*num)*FLT_FACTOR + HALF_FLT_FACTOR);
 }
 
 //Writes without boundary checking
@@ -129,10 +133,10 @@ void writeOut128Bytes_ulong(__global ulong *out, const uint * const index,
 void writeOut128Bytes_float(__global float *out, const uint * const index,
         const uint * const r1, const uint * const r2, const uint * const r3, const uint * const r4)
 {
-    out[*index]             = getFloat(r1);
-    out[*index +   THREADS] = getFloat(r2);
-    out[*index + 2*THREADS] = getFloat(r3);
-    out[*index + 3*THREADS] = getFloat(r4);
+    out[*index]             = 1.f - getFloat(r1);
+    out[*index +   THREADS] = 1.f - getFloat(r2);
+    out[*index + 2*THREADS] = 1.f - getFloat(r3);
+    out[*index + 3*THREADS] = 1.f - getFloat(r4);
 }
 
 
@@ -252,10 +256,10 @@ void partialWriteOut128Bytes_ulong(__global ulong *out, const uint * const index
 void partialWriteOut128Bytes_float(__global float *out, const uint * const index,
         const uint * const r1, const uint * const r2, const uint * const r3, const uint * const r4, const uint * const elements)
 {
-    if (*index             < *elements) {out[*index]             = getFloat(r1);}
-    if (*index +   THREADS < *elements) {out[*index +   THREADS] = getFloat(r2);}
-    if (*index + 2*THREADS < *elements) {out[*index + 2*THREADS] = getFloat(r3);}
-    if (*index + 3*THREADS < *elements) {out[*index + 3*THREADS] = getFloat(r4);}
+    if (*index             < *elements) {out[*index]             = 1.f - getFloat(r1);}
+    if (*index +   THREADS < *elements) {out[*index +   THREADS] = 1.f - getFloat(r2);}
+    if (*index + 2*THREADS < *elements) {out[*index + 2*THREADS] = 1.f - getFloat(r3);}
+    if (*index + 3*THREADS < *elements) {out[*index + 3*THREADS] = 1.f - getFloat(r4);}
 }
 
 #if RAND_DIST == 1
@@ -302,24 +306,31 @@ void partialBoxMullerWriteOut128Bytes_float(__global float *out, const uint * co
 #endif
 
 #ifdef USE_DOUBLE
+
+//Conversion to floats adapted from Random123
+#define UINTLMAX 0xffffffffffffffff
+#define DBL_FACTOR ((1.0)/(UINTLMAX + (1.0)))
+#define HALF_DBL_FACTOR ((0.5)*DBL_FACTOR)
+
+//Generates rationals in (0, 1]
 double getDouble(const uint * const num1, const uint * const num2)
 {
     ulong num = (((ulong)*num1)<<32) | ((ulong)*num2);
-    return ((double)num)/UINTLMAXDOUBLE;
+    return (num*DBL_FACTOR + HALF_DBL_FACTOR);
 }
 
 void writeOut128Bytes_double(__global double *out, const uint * const index,
         const uint * const r1, const uint * const r2, const uint * const r3, const uint * const r4)
 {
-    out[*index]           = getDouble(r1, r2);
-    out[*index + THREADS] = getDouble(r3, r4);
+    out[*index]           = 1.0 - getDouble(r1, r2);
+    out[*index + THREADS] = 1.0 - getDouble(r3, r4);
 }
 
 void partialWriteOut128Bytes_double(__global double *out, const uint * const index,
         const uint * const r1, const uint * const r2, const uint * const r3, const uint * const r4, const uint * const elements)
 {
-    if (*index           < *elements) {out[*index]           = getDouble(r1, r2);}
-    if (*index + THREADS < *elements) {out[*index + THREADS] = getDouble(r3, r4);}
+    if (*index           < *elements) {out[*index]           = 1.0 - getDouble(r1, r2);}
+    if (*index + THREADS < *elements) {out[*index + THREADS] = 1.0 - getDouble(r3, r4);}
 }
 
 #if RAND_DIST == 1

Original file line number	Diff line number	Diff line change
`@@ -25,19 +25,28 @@ namespace kernel`
`25`	`25`	`//Utils`
`26`	`26`
`27`	`27`	`static const int THREADS = 256;`
`28`		`- #define UINTMAXFLOAT 4294967296.0f`
`29`		`- #define UINTLMAXDOUBLE (4294967296.0*4294967296.0)`
`30`	`28`	`#define PI_VAL 3.1415926535897932384626433832795028841971693993751058209749445923078164`
`31`	`29`
	`30`	`+ //Conversion to floats adapted from Random123`
	`31`	`+ #define UINTMAX 0xffffffff`
	`32`	`+ #define FLT_FACTOR ((1.0f)/(UINTMAX + (1.0f)))`
	`33`	`+ #define HALF_FLT_FACTOR ((0.5f)*FLT_FACTOR)`
	`34`	`+`
	`35`	`+ #define UINTLMAX 0xffffffffffffffff`
	`36`	`+ #define DBL_FACTOR ((1.0)/(UINTLMAX + (1.0)))`
	`37`	`+ #define HALF_DBL_FACTOR ((0.5)*DBL_FACTOR)`
	`38`	`+`
	`39`	`+ //Generates rationals in (0, 1]`
`32`	`40`	`__device__ static float getFloat(const uint &num)`
`33`	`41`	`{`
`34`		`- return float(num)/UINTMAXFLOAT;`
	`42`	`+ return (num*FLT_FACTOR + HALF_FLT_FACTOR);`
`35`	`43`	`}`
`36`	`44`
	`45`	`+ //Generates rationals in (0, 1]`
`37`	`46`	`__device__ static double getDouble(const uint &num1, const uint &num2)`
`38`	`47`	`{`
`39`	`48`	`uintl num = (((uintl)num1)<<32) | ((uintl)num2);`
`40`		`- return double(num)/UINTLMAXDOUBLE;`
	`49`	`+ return (num*DBL_FACTOR + HALF_DBL_FACTOR);`
`41`	`50`	`}`
`42`	`51`
`43`	`52`	`template <typename T>`
`@@ -150,33 +159,33 @@ namespace kernel`
`150`	`159`	`__device__ static void writeOut128Bytes(float *out, const uint &index,`
`151`	`160`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4)`
`152`	`161`	`{`
`153`		`- out[index] = getFloat(r1);`
`154`		`- out[index + blockDim.x] = getFloat(r2);`
`155`		`- out[index + 2*blockDim.x] = getFloat(r3);`
`156`		`- out[index + 3*blockDim.x] = getFloat(r4);`
	`162`	`+ out[index] = 1.f - getFloat(r1);`
	`163`	`+ out[index + blockDim.x] = 1.f - getFloat(r2);`
	`164`	`+ out[index + 2*blockDim.x] = 1.f - getFloat(r3);`
	`165`	`+ out[index + 3*blockDim.x] = 1.f - getFloat(r4);`
`157`	`166`	`}`
`158`	`167`
`159`	`168`	`__device__ static void writeOut128Bytes(cfloat *out, const uint &index,`
`160`	`169`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4)`
`161`	`170`	`{`
`162`		`- out[index].x = getFloat(r1);`
`163`		`- out[index].y = getFloat(r2);`
`164`		`- out[index + blockDim.x].x = getFloat(r3);`
`165`		`- out[index + blockDim.x].y = getFloat(r4);`
	`171`	`+ out[index].x = 1.f - getFloat(r1);`
	`172`	`+ out[index].y = 1.f - getFloat(r2);`
	`173`	`+ out[index + blockDim.x].x = 1.f - getFloat(r3);`
	`174`	`+ out[index + blockDim.x].y = 1.f - getFloat(r4);`
`166`	`175`	`}`
`167`	`176`
`168`	`177`	`__device__ static void writeOut128Bytes(double *out, const uint &index,`
`169`	`178`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4)`
`170`	`179`	`{`
`171`		`- out[index] = getDouble(r1, r2);`
`172`		`- out[index + blockDim.x] = getDouble(r3, r4);`
	`180`	`+ out[index] = 1.0 - getDouble(r1, r2);`
	`181`	`+ out[index + blockDim.x] = 1.0 - getDouble(r3, r4);`
`173`	`182`	`}`
`174`	`183`
`175`	`184`	`__device__ static void writeOut128Bytes(cdouble *out, const uint &index,`
`176`	`185`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4)`
`177`	`186`	`{`
`178`		`- out[index].x = getDouble(r1, r2);`
`179`		`- out[index].y = getDouble(r3, r4);`
	`187`	`+ out[index].x = 1.0 - getDouble(r1, r2);`
	`188`	`+ out[index].y = 1.0 - getDouble(r3, r4);`
`180`	`189`	`}`
`181`	`190`
`182`	`191`	`//Normalized writes without boundary checking`
`@@ -305,38 +314,38 @@ namespace kernel`
`305`	`314`	`__device__ static void partialWriteOut128Bytes(float *out, const uint &index,`
`306`	`315`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)`
`307`	`316`	`{`
`308`		`- if (index < elements) {out[index] = getFloat(r1);}`
`309`		`- if (index + blockDim.x < elements) {out[index + blockDim.x] = getFloat(r2);}`
`310`		`- if (index + 2*blockDim.x < elements) {out[index + 2*blockDim.x] = getFloat(r3);}`
`311`		`- if (index + 3*blockDim.x < elements) {out[index + 3*blockDim.x] = getFloat(r4);}`
	`317`	`+ if (index < elements) {out[index] = 1.f - getFloat(r1);}`
	`318`	`+ if (index + blockDim.x < elements) {out[index + blockDim.x] = 1.f - getFloat(r2);}`
	`319`	`+ if (index + 2*blockDim.x < elements) {out[index + 2*blockDim.x] = 1.f - getFloat(r3);}`
	`320`	`+ if (index + 3*blockDim.x < elements) {out[index + 3*blockDim.x] = 1.f - getFloat(r4);}`
`312`	`321`	`}`
`313`	`322`
`314`	`323`	`__device__ static void partialWriteOut128Bytes(cfloat *out, const uint &index,`
`315`	`324`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)`
`316`	`325`	`{`
`317`	`326`	`if (index < elements) {`
`318`		`- out[index].x = getFloat(r1);`
`319`		`- out[index].y = getFloat(r2);`
	`327`	`+ out[index].x = 1.f - getFloat(r1);`
	`328`	`+ out[index].y = 1.f - getFloat(r2);`
`320`	`329`	`}`
`321`	`330`	`if (index + blockDim.x < elements) {`
`322`		`- out[index + blockDim.x].x = getFloat(r3);`
`323`		`- out[index + blockDim.x].y = getFloat(r4);`
	`331`	`+ out[index + blockDim.x].x = 1.f - getFloat(r3);`
	`332`	`+ out[index + blockDim.x].y = 1.f - getFloat(r4);`
`324`	`333`	`}`
`325`	`334`	`}`
`326`	`335`
`327`	`336`	`__device__ static void partialWriteOut128Bytes(double *out, const uint &index,`
`328`	`337`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)`
`329`	`338`	`{`
`330`		`- if (index < elements) {out[index] = getDouble(r1, r2);}`
`331`		`- if (index + blockDim.x < elements) {out[index + blockDim.x] = getDouble(r3, r4);}`
	`339`	`+ if (index < elements) {out[index] = 1.0 - getDouble(r1, r2);}`
	`340`	`+ if (index + blockDim.x < elements) {out[index + blockDim.x] = 1.0 - getDouble(r3, r4);}`
`332`	`341`	`}`
`333`	`342`
`334`	`343`	`__device__ static void partialWriteOut128Bytes(cdouble *out, const uint &index,`
`335`	`344`	`const uint &r1, const uint &r2, const uint &r3, const uint &r4, const uint &elements)`
`336`	`345`	`{`
`337`	`346`	`if (index < elements) {`
`338`		`- out[index].x = getDouble(r1, r2);`
`339`		`- out[index].y = getDouble(r3, r4);`
	`347`	`+ out[index].x = 1.0 - getDouble(r1, r2);`
	`348`	`+ out[index].y = 1.0 - getDouble(r3, r4);`
`340`	`349`	`}`
`341`	`350`	`}`
`342`	`351`