Skip to content

Commit 387072e

Browse files
authored
Create vector.cu
1 parent 695ad65 commit 387072e

File tree

1 file changed

+72
-0
lines changed

1 file changed

+72
-0
lines changed

src/Matlab/vector.cu

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#include "cuda_runtime.h"
2+
#include "mex.h"
3+
/* Kernel to square elements of the array on the GPU */
4+
__global__ void square_elements(float *in, float *out, int N)
5+
{
6+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
7+
if (idx < N)
8+
out[idx] = in[idx] * in[idx];
9+
}
10+
/* Gateway function */
11+
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
12+
{
13+
int i, j, m, n;
14+
double *data1, *data2;
15+
float *data1f, *data2f;
16+
float *data1f_gpu, *data2f_gpu;
17+
mxClassID category;
18+
if (nrhs != nlhs)
19+
mexErrMsgTxt("The number of input and output arguments must be the same.");
20+
for (i = 0; i < nrhs; i++)
21+
{
22+
/* Find the dimensions of the data */
23+
m = mxGetM(prhs[i]);
24+
n = mxGetN(prhs[i]);
25+
/* Create an mxArray for the output data */
26+
plhs[i] = mxCreateDoubleMatrix(m, n, mxREAL);
27+
/* Create an input and output data array on the GPU*/
28+
cudaMalloc((void **)&data1f_gpu, sizeof(float) * m * n);
29+
cudaMalloc((void **)&data2f_gpu, sizeof(float) * m * n);
30+
/* Retrieve the input data */
31+
data1 = mxGetPr(prhs[i]);
32+
/* Check if the input array is single or double precision */
33+
category = mxGetClassID(prhs[i]);
34+
if (category == mxSINGLE_CLASS)
35+
{
36+
/* The input array is single precision, it can be sent directly to the card */
37+
cudaMemcpy(data1f_gpu, data1, sizeof(float) * m * n, cudaMemcpyHostToDevice);
38+
}
39+
if (category == mxDOUBLE_CLASS)
40+
{
41+
/* The input array is in double precision, it needs to be converted to floats before being sent to the card */
42+
data1f = (float *)mxMalloc(sizeof(float) * m * n);
43+
for (j = 0; j < m * n; j++)
44+
{
45+
data1f[j] = (float)data1[j];
46+
}
47+
cudaMemcpy(data1f_gpu, data1f, sizeof(float) * n * m, cudaMemcpyHostToDevice);
48+
}
49+
data2f = (float *)mxMalloc(sizeof(float) * m * n);
50+
/* Compute execution configuration using 128 threads per block */
51+
dim3 dimBlock(128);
52+
dim3 dimGrid((m * n) / dimBlock.x);
53+
if ((n * m) % 128 != 0)
54+
dimGrid.x += 1;
55+
/* Call function on GPU */
56+
square_elements<<<dimGrid, dimBlock>>>(data1f_gpu, data2f_gpu, n * m);
57+
/* Copy result back to host */
58+
cudaMemcpy(data2f, data2f_gpu, sizeof(float) * n * m, cudaMemcpyDeviceToHost);
59+
/* Create a pointer to the output data */
60+
data2 = mxGetPr(plhs[i]);
61+
/* Convert from single to double before returning */
62+
for (j = 0; j < m * n; j++)
63+
{
64+
data2[j] = (double)data2f[j];
65+
}
66+
/* Clean-up memory on device and host */
67+
mxFree(data1f);
68+
mxFree(data2f);
69+
cudaFree(data1f_gpu);
70+
cudaFree(data2f_gpu);
71+
}
72+
}

0 commit comments

Comments
 (0)