relu backprop
CubeFlix committed Jul 6, 2023
1 parent b2c4d81 commit c5c87d6
Showing 5 changed files with 67 additions and 5 deletions.
7 changes: 7 additions & 0 deletions matricies/Matrix.cpp
@@ -143,6 +143,13 @@ namespace Oliver {
         }
     }
 
+    void Matrix::heaviside(int device) {
+        cudaError_t cudaError = cudaHeaviside(this, device);
+        if (cudaError != cudaSuccess) {
+            throw CUDAException(cudaError);
+        }
+    }
+
     void Matrix::addBias(Matrix* x, int device) {
         // Ensure the dimensions match.
         if (m_cols != x->cols()) {
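The wrapper follows the same pattern as the other element-wise operations in Matrix.cpp: it forwards to the CUDA entry point and throws a CUDAException on failure. Below is a minimal host-side usage sketch, not part of the commit, assuming only the Matrix constructor, buf(), and heaviside(int device) declared in Matrix.h further down, and a single GPU exposed as device 0:

#include "Matrix.h"

int main() {
    // Fill a 2x2 matrix through its raw host buffer.
    Oliver::Matrix m(2, 2);
    float vals[4] = { -1.5f, 0.0f, 2.0f, -0.25f };
    for (int i = 0; i < 4; i++) {
        m.buf()[i] = vals[i];
    }

    // In-place Heaviside step on device 0: negative entries become 0, positive entries become 1.
    m.heaviside(0);
    return 0;
}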
5 changes: 3 additions & 2 deletions matricies/Matrix.h
@@ -27,8 +27,8 @@ namespace Oliver {
         Matrix(unsigned const int rows, unsigned const int cols);
         ~Matrix();
 
-        unsigned const int rows();
-        unsigned const int cols();
+        unsigned int rows();
+        unsigned int cols();
         float* buf();
 
         void add(Matrix *x, int device);
@@ -44,6 +44,7 @@ namespace Oliver {
         void inv(int device);
         void exp(int device);
         void log(int device);
+        void heaviside(int device);
         void addBias(Matrix* x, int device);
         void max(float x, int device);
 
54 changes: 54 additions & 0 deletions matricies/MatrixKernels.cu
@@ -758,6 +758,60 @@ namespace Oliver {
         return cudaStatus;
     }
 
+    // Matrix heaviside kernel.
+    __global__ void heavisideKernel(float* a, const unsigned int rows, const unsigned int cols) {
+        int rowIdx = threadIdx.y + blockIdx.y * blockDim.y;
+        int colIdx = threadIdx.x + blockIdx.x * blockDim.x;
+
+        if (rowIdx < rows && colIdx < cols) {
+            int i = colIdx + rowIdx * cols;
+            a[i] = step(0.0, a[i]);
+        }
+    }
+
+    // GPU matrix heaviside function.
+    cudaError_t cudaHeaviside(Matrix* a, int device) {
+        float* dev_a = 0;
+        size_t s = a->rows() * a->cols() * sizeof(float);
+        cudaError_t cudaStatus;
+
+        // Move data to the device.
+        cudaStatus = cudaSetDevice(device);
+        if (cudaStatus != cudaSuccess) {
+            goto Error;
+        }
+        cudaStatus = cudaAllocCopy((void**)&dev_a, a->buf(), s);
+        if (cudaStatus != cudaSuccess) {
+            goto Error;
+        }
+
+        // Calculate the thread and block dimensions.
+        const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 blocksPerGrid((a->cols() + BLOCK_SIZE - 1) / BLOCK_SIZE, (a->rows() + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+        // Run the kernel.
+        heavisideKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, a->rows(), a->cols());
+
+        // Clean up.
+        cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            goto Error;
+        }
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            goto Error;
+        }
+        cudaStatus = cudaMemcpy(a->buf(), dev_a, s, cudaMemcpyDeviceToHost);
+        if (cudaStatus != cudaSuccess) {
+            goto Error;
+        }
+
+    Error:
+        cudaFree(dev_a);
+
+        return cudaStatus;
+    }
+
     // Matrix add bias kernel.
     __global__ void addBiasKernel(float* a, const float* b, const unsigned int rows, const unsigned int cols) {
         int rowIdx = threadIdx.y + blockIdx.y * blockDim.y;
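heavisideKernel calls a step helper that is not part of this diff and is not provided by CUDA's device math library, so it is presumably defined earlier in MatrixKernels.cu. A minimal sketch of what such a helper could look like, an assumption rather than the repository's actual definition:

// Hypothetical device helper: step(edge, x) returns 0 when x < edge and 1 otherwise,
// so step(0.0, a[i]) computes the Heaviside step of a[i] (taking H(0) = 1).
__device__ inline float step(float edge, float x) {
    return x < edge ? 0.0f : 1.0f;
}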
1 change: 1 addition & 0 deletions matricies/MatrixKernels.cuh
@@ -34,6 +34,7 @@ namespace Oliver {
     cudaError_t cudaInv(Matrix* a, int device);
     cudaError_t cudaExp(Matrix* a, int device);
     cudaError_t cudaLog(Matrix* a, int device);
+    cudaError_t cudaHeaviside(Matrix* a, int device);
     cudaError_t cudaAddBias(Matrix* a, Matrix* b, int device);
     cudaError_t cudaScalarMax(Matrix* a, float b, int device);
     cudaError_t cudaTranspose(Matrix* a, Matrix* out, int device);
5 changes: 2 additions & 3 deletions matricies/RELULayer.cpp
@@ -67,10 +67,9 @@ namespace Oliver {
             throw NetworkException("invalid input, previous gradient, and output gradient matrix sizes for layer");
         }
 
-        // Calculate max(x, 0)/max(x, 0) * grad.
+        // Calculate heaviside(input) * grad.
         memcpy(prevGrad->buf(), m_inputCache->buf(), m_inputCache->rows() * m_inputCache->cols() * sizeof(float));
-        prevGrad->max(0.0, device);
-        prevGrad->div(prevGrad, device);
+        prevGrad->heaviside(device);
         prevGrad->mul(outputGrad, device);
 
         delete m_inputCache;
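With this change the backward pass computes prevGrad = heaviside(input) * outputGrad elementwise, i.e. the ReLU gradient: 1 where the cached input is positive, 0 where it is negative. The replaced max(x, 0)/max(x, 0) formulation evaluates to 0/0 (NaN) wherever the input is not positive. A CPU reference of the new computation, illustrative only and using a hypothetical helper name:

#include <cstddef>

// reluBackwardReference: hypothetical CPU-side check of the gradient step above.
// prevGrad[i] = heaviside(input[i]) * outputGrad[i], taking H(0) = 1.
void reluBackwardReference(const float* input, const float* outputGrad,
                           float* prevGrad, std::size_t n) {
    for (std::size_t i = 0; i < n; i++) {
        float h = (input[i] < 0.0f) ? 0.0f : 1.0f;
        prevGrad[i] = h * outputGrad[i];
    }
}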
