|
| 1 | +#include "cuda_runtime.h" |
| 2 | +#include "mex.h" |
| 3 | +/* Kernel to square elements of the array on the GPU */ |
| 4 | +__global__ void square_elements(float *in, float *out, int N) |
| 5 | +{ |
| 6 | + int idx = blockIdx.x * blockDim.x + threadIdx.x; |
| 7 | + if (idx < N) |
| 8 | + out[idx] = in[idx] * in[idx]; |
| 9 | +} |
| 10 | +/* Gateway function */ |
| 11 | +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) |
| 12 | +{ |
| 13 | + int i, j, m, n; |
| 14 | + double *data1, *data2; |
| 15 | + float *data1f, *data2f; |
| 16 | + float *data1f_gpu, *data2f_gpu; |
| 17 | + mxClassID category; |
| 18 | + if (nrhs != nlhs) |
| 19 | + mexErrMsgTxt("The number of input and output arguments must be the same."); |
| 20 | + for (i = 0; i < nrhs; i++) |
| 21 | + { |
| 22 | + /* Find the dimensions of the data */ |
| 23 | + m = mxGetM(prhs[i]); |
| 24 | + n = mxGetN(prhs[i]); |
| 25 | + /* Create an mxArray for the output data */ |
| 26 | + plhs[i] = mxCreateDoubleMatrix(m, n, mxREAL); |
| 27 | + /* Create an input and output data array on the GPU*/ |
| 28 | + cudaMalloc((void **)&data1f_gpu, sizeof(float) * m * n); |
| 29 | + cudaMalloc((void **)&data2f_gpu, sizeof(float) * m * n); |
| 30 | + /* Retrieve the input data */ |
| 31 | + data1 = mxGetPr(prhs[i]); |
| 32 | + /* Check if the input array is single or double precision */ |
| 33 | + category = mxGetClassID(prhs[i]); |
| 34 | + if (category == mxSINGLE_CLASS) |
| 35 | + { |
| 36 | + /* The input array is single precision, it can be sent directly to the card */ |
| 37 | + cudaMemcpy(data1f_gpu, data1, sizeof(float) * m * n, cudaMemcpyHostToDevice); |
| 38 | + } |
| 39 | + if (category == mxDOUBLE_CLASS) |
| 40 | + { |
| 41 | + /* The input array is in double precision, it needs to be converted to floats before being sent to the card */ |
| 42 | + data1f = (float *)mxMalloc(sizeof(float) * m * n); |
| 43 | + for (j = 0; j < m * n; j++) |
| 44 | + { |
| 45 | + data1f[j] = (float)data1[j]; |
| 46 | + } |
| 47 | + cudaMemcpy(data1f_gpu, data1f, sizeof(float) * n * m, cudaMemcpyHostToDevice); |
| 48 | + } |
| 49 | + data2f = (float *)mxMalloc(sizeof(float) * m * n); |
| 50 | + /* Compute execution configuration using 128 threads per block */ |
| 51 | + dim3 dimBlock(128); |
| 52 | + dim3 dimGrid((m * n) / dimBlock.x); |
| 53 | + if ((n * m) % 128 != 0) |
| 54 | + dimGrid.x += 1; |
| 55 | + /* Call function on GPU */ |
| 56 | + square_elements<<<dimGrid, dimBlock>>>(data1f_gpu, data2f_gpu, n * m); |
| 57 | + /* Copy result back to host */ |
| 58 | + cudaMemcpy(data2f, data2f_gpu, sizeof(float) * n * m, cudaMemcpyDeviceToHost); |
| 59 | + /* Create a pointer to the output data */ |
| 60 | + data2 = mxGetPr(plhs[i]); |
| 61 | + /* Convert from single to double before returning */ |
| 62 | + for (j = 0; j < m * n; j++) |
| 63 | + { |
| 64 | + data2[j] = (double)data2f[j]; |
| 65 | + } |
| 66 | + /* Clean-up memory on device and host */ |
| 67 | + mxFree(data1f); |
| 68 | + mxFree(data2f); |
| 69 | + cudaFree(data1f_gpu); |
| 70 | + cudaFree(data2f_gpu); |
| 71 | + } |
| 72 | +} |
0 commit comments