@@ -37,52 +37,50 @@ __global__ void __launch_bounds__(THREADS) layer_norm_NC(
         v_mean1 = ew_add(v_mean1, x);
         v_mean2 = ew_add(v_mean2, ew_sqr(x));
     }
-    float2 mean;
-    mean.x = ew_sum(v_mean1) * rcpK;
-    mean.y = ew_sum(v_mean2) * rcpK;
+    float2 stats;
+    stats.x = ew_sum(v_mean1) * rcpK;
+    stats.y = ew_sum(v_mean2) * rcpK;

     // reduce within warp
     for (int i = 16; i > 0; i >>= 1)
-    {
-        mean.x += shfl_xor(mean.x, i);
-        mean.y += shfl_xor(mean.y, i);
-    }
+        stats = ew_warp_sum(stats, i);
+
     // if using more than 1 warp, further reduce with shared memory
     if (THREADS > 32)
     {
         __shared__ float2 Share[32];

         // first thread of each warp stores to shared
         if ((tid & 31) == 0)
-            Share[tid/32] = mean;
+            Share[tid/32] = stats;

         __syncthreads();

         if (tid < 32)
         {
             // first warp loads all prior reductions
-            mean = Share[tid];
+            stats = Share[tid];

             // reduce within this first warp
             for (int i = THREADS/64; i > 0; i >>= 1)
-            {
-                mean.x += shfl_xor(mean.x, i);
-                mean.y += shfl_xor(mean.y, i);
-            }
-            // outputs final reduction to shared
-            Share[tid] = mean;
+                stats = ew_warp_sum(stats, i);
+
+            // final reduction to shared
+            Share[tid] = stats;
         }
         __syncthreads();

         // broadcast result to all threads
-        mean = Share[0];
+        stats = Share[0];
     }
     // var = avg(x**2) - avg(x)**2
     // rstd = 1/sqrt(var)
-    float rstd = rsqrtf((mean.y - ew_sqr(mean.x)) + epsilon);
+    float mean = stats.x;
+    float rstd = rsqrtf(precise_sub(stats.y, ew_sqr(mean)) + epsilon);
+
     if (tid == 0)
     {
-        Mean[n] = mean.x;
+        Mean[n] = mean;
         Rstd[n] = rstd;
     }

@@ -94,7 +92,7 @@ __global__ void __launch_bounds__(THREADS) layer_norm_NC(
         V g = load(G, k);
         V b = load(B, k);

-        V xhat = ew_mul(ew_sub(x, mean.x), rstd);
+        V xhat = ew_mul(ew_sub(x, mean), rstd);
         V y = ew_add(ew_mul(xhat, g), b);

         if (relu)
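
The hunk above folds the hand-rolled shfl_xor pair into a single ew_warp_sum helper, so one butterfly step covers both running moments at once. As a rough sketch only (the real definition lives with the other ew_* element-wise device helpers and may wrap the repo's own shfl_xor rather than the raw intrinsic), such a helper could look like this:

// Sketch only, not the library's definition: one butterfly step of a
// warp reduction that sums both fields of the float2 across lanes i apart.
__device__ __forceinline__ float2 ew_warp_sum(float2 v, int i)
{
    v.x += __shfl_xor_sync(0xffffffff, v.x, i);  // running sum of x
    v.y += __shfl_xor_sync(0xffffffff, v.y, i);  // running sum of x**2
    return v;
}

Looping i over 16, 8, 4, 2, 1 leaves every lane holding the full 32-lane sums, which is why the caller no longer needs the braces or the per-field adds.
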
@@ -513,17 +511,17 @@ __global__ void layer_norm_segmented_nc(
             #pragma unroll 1
             for (int i = thread2/64; i > 0; i >>= 1)
                 stats = ew_warp_sum(stats, i);
+
             // final reduction to shared
             Share[tid] = stats;
         }
         __syncthreads();
         stats = Share[0];
     }
-
     // var = avg(x**2) - avg(x)**2
     // rstd = 1/sqrt(var)
     float mean = stats.x;
-    float rstd = rsqrtf((stats.y - mean*mean) + epsilon);
+    float rstd = rsqrtf(precise_sub(stats.y, ew_sqr(mean)) + epsilon);
     if (tid == 0)
     {
         __stg(add_ptr_u(Mean, m), mean);
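
Both kernels now compute the variance with precise_sub(stats.y, ew_sqr(mean)) instead of a plain subtraction. The likely motivation: in E[x**2] - E[x]**2 form the compiler is free to contract the square and the subtract into an FMA, so different kernels (or forward and backward passes) could round the variance differently. A hypothetical definition in that spirit, assuming the helper exists purely to pin down the rounding (the actual one ships with the other ew_* device functions and may differ):

// Hypothetical stand-in for precise_sub: inline PTX keeps this a plain
// IEEE round-to-nearest subtract that ptxas cannot fuse into an FMA.
__device__ __forceinline__ float precise_sub(float a, float b)
{
    float r;
    asm("sub.rn.f32 %0, %1, %2;" : "=f"(r) : "f"(a), "f"(b));
    return r;
}

The + epsilon term still guards rsqrtf against the slightly negative variance this formula can produce under rounding.
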
@@ -808,17 +806,20 @@ bool LayerNormSegmentedBackward_NC(CUstream stream, int SMs,
     const float* b,
     const float* mean,
     const float* rstd,
-    float epsilon, uint N, uint S, uint K, float rcpK, int relu)
+    float epsilon, uint N, uint S, uint K, float rcpK, int relu, int atomics)
 {
     uint gridK = CEIL_DIV(K, 32);
     uint gridN = 1;
-    uint blocksK = gridK * S;
-    while (gridN < (N>>3) && gridN * blocksK < 32*SMs) gridN += 1;
-    if (gridN * blocksK > 32*SMs && gridN > 1) gridN -= 1;
-    if (gridN > 1)
+    if (atomics)
     {
-        cuMemsetD32Async((CUdeviceptr)dg, 0, S*K, stream);
-        cuMemsetD32Async((CUdeviceptr)db, 0, S*K, stream);
+        uint blocksK = gridK * S;
+        while (gridN < (N>>3) && gridN * blocksK < 32*SMs) gridN += 1;
+        if (gridN * blocksK > 32*SMs && gridN > 1) gridN -= 1;
+        if (gridN > 1)
+        {
+            cuMemsetD32Async((CUdeviceptr)dg, 0, S*K, stream);
+            cuMemsetD32Async((CUdeviceptr)db, 0, S*K, stream);
+        }
     }
     layer_norm_segmented_dg_db_nc<T><<<dim3(gridN,gridK,S),32,0,stream>>>(dg, db, dy, x, g, b, mean, rstd, N, S*K, S*K*gridN, K, relu);

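
In the wrapper above, splitting the reduction over N (gridN > 1) is now gated on the new atomics flag: when the caller disallows atomics (e.g. for deterministic gradients), gridN stays 1 and each dg/db column is written by exactly one block. With gridN > 1, the partial sums from different blocks presumably combine via atomicAdd into the dg/db buffers, which is why the cuMemsetD32Async zeroing only runs on that path. An illustrative device-side pattern of the choice this enables (names invented for the sketch, not the kernel's actual code):

// Illustrative only: accumulate-vs-store on the output column k.
// Several blocks per column combine with atomicAdd into the zero-filled
// buffer; a single owning block can use a plain store instead.
__device__ void store_dg_db(float* dg, float* db, uint k,
                            float dg_part, float db_part, uint gridN)
{
    if (gridN > 1)
    {
        atomicAdd(dg + k, dg_part);
        atomicAdd(db + k, db_part);
    }
    else
    {
        dg[k] = dg_part;
        db[k] = db_part;
    }
}
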
@@ -869,9 +870,9 @@ bool LayerNormSegmentedBackward_NC(CUstream stream, int SMs,
     }
     return true; // TODO
 }
-template bool LayerNormSegmentedBackward_NC<float,float4>(CUstream stream, int SMs, float* dx, float* dg, float* db, const float* dy, const float* x, const float* g, const float* b, const float* mean, const float* rstd, float epsilon, uint N, uint S, uint K, float rcpK, int relu);
-template bool LayerNormSegmentedBackward_NC<ehalf,ehalf4>(CUstream stream, int SMs, ehalf* dx, float* dg, float* db, const ehalf* dy, const ehalf* x, const float* g, const float* b, const float* mean, const float* rstd, float epsilon, uint N, uint S, uint K, float rcpK, int relu);
-template bool LayerNormSegmentedBackward_NC<bhalf,bhalf4>(CUstream stream, int SMs, bhalf* dx, float* dg, float* db, const bhalf* dy, const bhalf* x, const float* g, const float* b, const float* mean, const float* rstd, float epsilon, uint N, uint S, uint K, float rcpK, int relu);
+template bool LayerNormSegmentedBackward_NC<float,float4>(CUstream stream, int SMs, float* dx, float* dg, float* db, const float* dy, const float* x, const float* g, const float* b, const float* mean, const float* rstd, float epsilon, uint N, uint S, uint K, float rcpK, int relu, int atomics);
+template bool LayerNormSegmentedBackward_NC<ehalf,ehalf4>(CUstream stream, int SMs, ehalf* dx, float* dg, float* db, const ehalf* dy, const ehalf* x, const float* g, const float* b, const float* mean, const float* rstd, float epsilon, uint N, uint S, uint K, float rcpK, int relu, int atomics);
+template bool LayerNormSegmentedBackward_NC<bhalf,bhalf4>(CUstream stream, int SMs, bhalf* dx, float* dg, float* db, const bhalf* dy, const bhalf* x, const float* g, const float* b, const float* mean, const float* rstd, float epsilon, uint N, uint S, uint K, float rcpK, int relu, int atomics);


 #endif // GOOGLE_CUDA