bug fixes

annotifyapp · Jun 2, 2021 · 47ebfcc · 47ebfcc
1 parent ad94866
commit 47ebfcc
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 15 deletions.
diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu
@@ -148,7 +148,6 @@ cudaError_t CutlassSgemmNN(
 /// Kernel to initialize a matrix with small integers.
 __global__ void InitializeMatrix_kernel(
   float *matrix,
-  int ldm,
   int rows,
   int columns,
   int seed = 0) {
@@ -157,7 +156,7 @@ __global__ void InitializeMatrix_kernel(
   int j = threadIdx.y + blockIdx.y * blockDim.y;
 
   if (i < rows && j < columns) {
-    int offset = i + j * ldm;
+    int offset = i + j * rows;
 
     // Generate arbitrary elements.
     int const k = 16807;
@@ -169,26 +168,26 @@ __global__ void InitializeMatrix_kernel(
 }
 
 /// Simple function to initialize a matrix to arbitrary small integers.
-cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
+cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) {
 
   dim3 block(16, 16);
   dim3 grid(
     (rows + block.x - 1) / block.x,
     (columns + block.y - 1) / block.y
   );
 
-  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
+  InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed);
 
   return cudaGetLastError();
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Allocates device memory for a matrix then fills with arbitrary small integers.
-cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
+cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) {
   cudaError_t result;
 
-  size_t sizeof_matrix = sizeof(float) * ldm * columns;
+  size_t sizeof_matrix = sizeof(float) * rows * columns;  // dense column-major: rows * columns elements (leading dimension == rows)
 
   // Allocate device memory.
   result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
@@ -209,7 +208,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s
   }
 
   // Initialize matrix elements to arbitrary small integers.
-  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
+  result = InitializeMatrix(*matrix, rows, columns, seed);
 
   if (result != cudaSuccess) {
     std::cerr << "Failed to initialize matrix: "
@@ -304,28 +303,28 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
   // Allocate matrices in GPU device memory with arbitrary seeds.
   //
 
-  result = AllocateMatrix(&A, lda, M, K, 0);
+  result = AllocateMatrix(&A, M, K, 0);
 
   if (result !=  cudaSuccess) {
     return result;
   }
 
-  result = AllocateMatrix(&B, ldb, K, N, 17);
+  result = AllocateMatrix(&B, K, N, 17);
 
   if (result !=  cudaSuccess) {
     cudaFree(A);
     return result;
   }
 
-  result = AllocateMatrix(&C_cutlass, ldc, M, N, 101);
+  result = AllocateMatrix(&C_cutlass, M, N, 101);
 
   if (result != cudaSuccess) {
     cudaFree(A);
     cudaFree(B);
     return result;
   }
 
-  result = AllocateMatrix(&C_reference, ldc, M, N, 101);
+  result = AllocateMatrix(&C_reference, M, N, 101);
 
   if (result != cudaSuccess) {
     cudaFree(A);

diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h
@@ -223,11 +223,14 @@ class LinearCombinationClamp {
     intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
 
     /// Clamping constant value
-    ElementCompute const kClamp =
-        ElementCompute((1U << (sizeof_bits<ElementOutput>::value - 1)) - 1);
+    ElementCompute const kClampMax =
+        ElementCompute(platform::numeric_limits<ElementOutput>::max());
+
+    ElementCompute const kClampMin =
+        ElementCompute(platform::numeric_limits<ElementOutput>::lowest());
 
-    intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1));
-    intermediate = min_accumulator(intermediate, kClamp);
+    intermediate = max_accumulator(intermediate, kClampMin);
+    intermediate = min_accumulator(intermediate, kClampMax);
 
     // Convert to destination numeric type
     NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;