Skip to content

Commit

Permalink
bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mani-ananth committed Jun 2, 2021
1 parent ad94866 commit 47ebfcc
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 15 deletions.
21 changes: 10 additions & 11 deletions examples/00_basic_gemm/basic_gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,6 @@ cudaError_t CutlassSgemmNN(
/// Kernel to initialize a matrix with small integers.
__global__ void InitializeMatrix_kernel(
float *matrix,
int ldm,
int rows,
int columns,
int seed = 0) {
Expand All @@ -157,7 +156,7 @@ __global__ void InitializeMatrix_kernel(
int j = threadIdx.y + blockIdx.y * blockDim.y;

if (i < rows && j < columns) {
int offset = i + j * ldm;
int offset = i + j * rows;

// Generate arbitrary elements.
int const k = 16807;
Expand All @@ -169,26 +168,26 @@ __global__ void InitializeMatrix_kernel(
}

/// Simple function to initialize a matrix to arbitrary small integers.
cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) {

dim3 block(16, 16);
dim3 grid(
(rows + block.x - 1) / block.x,
(columns + block.y - 1) / block.y
);

InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed);

return cudaGetLastError();
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Allocates device memory for a matrix then fills with arbitrary small integers.
cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) {
cudaError_t result;

size_t sizeof_matrix = sizeof(float) * ldm * columns;
size_t sizeof_matrix = sizeof(float) * rows * columns;

// Allocate device memory.
result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
Expand All @@ -209,7 +208,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s
}

// Initialize matrix elements to arbitrary small integers.
result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
result = InitializeMatrix(*matrix, rows, columns, seed);

if (result != cudaSuccess) {
std::cerr << "Failed to initialize matrix: "
Expand Down Expand Up @@ -304,28 +303,28 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
// Allocate matrices in GPU device memory with arbitrary seeds.
//

result = AllocateMatrix(&A, lda, M, K, 0);
result = AllocateMatrix(&A, M, K, 0);

if (result != cudaSuccess) {
return result;
}

result = AllocateMatrix(&B, ldb, K, N, 17);
result = AllocateMatrix(&B, K, N, 17);

if (result != cudaSuccess) {
cudaFree(A);
return result;
}

result = AllocateMatrix(&C_cutlass, ldc, M, N, 101);
result = AllocateMatrix(&C_cutlass, M, N, 101);

if (result != cudaSuccess) {
cudaFree(A);
cudaFree(B);
return result;
}

result = AllocateMatrix(&C_reference, ldc, M, N, 101);
result = AllocateMatrix(&C_reference, M, N, 101);

if (result != cudaSuccess) {
cudaFree(A);
Expand Down
11 changes: 7 additions & 4 deletions include/cutlass/epilogue/thread/linear_combination_clamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,14 @@ class LinearCombinationClamp {
intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum

/// Clamping constant value
ElementCompute const kClamp =
ElementCompute((1U << (sizeof_bits<ElementOutput>::value - 1)) - 1);
ElementCompute const kClampMax =
ElementCompute(platform::numeric_limits<ElementOutput>::max());

ElementCompute const kClampMin =
ElementCompute(platform::numeric_limits<ElementOutput>::lowest());

intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1));
intermediate = min_accumulator(intermediate, kClamp);
intermediate = max_accumulator(intermediate, kClampMin);
intermediate = min_accumulator(intermediate, kClampMax);

// Convert to destination numeric type
NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
Expand Down

0 comments on commit 47ebfcc

Please sign in to comment.