#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "DS_timer.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
8+
9+ #define DO_CPU
10+ #define DATA_TYEP int
11+
12+ #define SIZE_M (512 *2 )
13+ #define SIZE_N (512 *4 )
14+ #define SIZE_K (512 *2 )
15+
#define INDEX2ROW(_index,_width)	(int)((_index)/(_width))
#define INDEX2COL(_index,_width)	((_index)%(_width))
#define ID2INDEX(_row,_col,_width)	(((_row)*(_width))+(_col))
19+
20+ #define BLOCK_SIZE 16
21+
22+ // Macro function
23+ // #define KERNEL_MUL(_a,_b) __fmul_rn(_a,_b)
#define KERNEL_MUL(_a,_b) (_a*_b)
25+
26+ // kernel declarations
27+ __global__ void MatMul (DATA_TYEP* matA, DATA_TYEP* matB, DATA_TYEP* matC, int m, int n, int k);
28+
29+ template <class T > void allocNinitMem (T** p, long long size, double * memUsage = NULL );
30+ bool compareMatrix (DATA_TYEP* _A, DATA_TYEP* _B, int _size);
31+
// Host driver: builds random m x k and k x n input matrices, multiplies them
// on the CPU (OpenMP reference, when DO_CPU is defined) and on the GPU with
// the naive MatMul kernel, then verifies both results agree.
// Usage: prog [m n k] — all three sizes must be supplied together; otherwise
// the SIZE_M/SIZE_N/SIZE_K defaults are used.
int main(int argc, char* argv[])
{
	DS_timer timer(10);
	timer.setTimerName(0, (char*)"CPU algorithm");
	timer.setTimerName(1, (char*)"GPU/CUDA algorithm");
	timer.setTimerName(2, (char*)" - Kernel");
	timer.setTimerName(4, (char*)" - [Data transfer] host->device");
	timer.setTimerName(5, (char*)" - [Data transfer] device->host");

	// Matrix sizes: A = m x k, B = k x n, C = m x n.
	int m, n, k;

	// BUGFIX: was `argc < 3`, which took the else-branch when exactly two
	// arguments were given and dereferenced argv[3] == NULL in atoi().
	// Reading argv[1..3] requires argc >= 4.
	if (argc < 4) { m = SIZE_M; n = SIZE_N; k = SIZE_K; }
	else { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); }

	printf("Size : A = (%d by %d), B = (%d by %d), C = (%d by %d)\n", m, k, k, n, m, n);

	int sizeA = m * k;
	int sizeB = k * n;
	int sizeC = m * n;

	// Host buffers (zero-initialized by allocNinitMem).
	DATA_TYEP* A = NULL, * B = NULL;
	allocNinitMem<DATA_TYEP>(&A, sizeA);
	allocNinitMem<DATA_TYEP>(&B, sizeB);

	DATA_TYEP* Ccpu = NULL, * Cgpu = NULL;
	allocNinitMem<DATA_TYEP>(&Ccpu, sizeC);
	allocNinitMem<DATA_TYEP>(&Cgpu, sizeC);

	// Random inputs. NOTE(review): with DATA_TYEP == int the fractional part
	// is truncated on assignment, so values are effectively integers in 0..9.
	for (int i = 0; i < sizeA; i++) A[i] = ((rand() % 10) + ((rand() % 100) / 100.0));
	for (int i = 0; i < sizeB; i++) B[i] = ((rand() % 10) + ((rand() % 100) / 100.0));

#ifdef DO_CPU // CPU reference version (OpenMP)
	timer.onTimer(0);
#pragma omp parallel for num_threads(4)
	for (int row = 0; row < m; row++) {
		for (int col = 0; col < n; col++) {
			int cIndex = ID2INDEX(row, col, n);
			Ccpu[cIndex] = 0;
			for (int i = 0; i < k; i++)
				Ccpu[cIndex] += (A[ID2INDEX(row, i, k)] * B[ID2INDEX(i, col, n)]);
		}
	}
	timer.offTimer(0); // stop before the printf so console I/O is not timed
	printf("CPU finished!\n");
#endif

	// Device buffers, zeroed before use.
	DATA_TYEP* dA, * dB, * dC;

	cudaMalloc(&dA, sizeA * sizeof(DATA_TYEP));
	cudaMemset(dA, 0, sizeA * sizeof(DATA_TYEP));

	cudaMalloc(&dB, sizeB * sizeof(DATA_TYEP));
	cudaMemset(dB, 0, sizeB * sizeof(DATA_TYEP));

	cudaMalloc(&dC, sizeC * sizeof(DATA_TYEP));
	cudaMemset(dC, 0, sizeC * sizeof(DATA_TYEP));

	timer.onTimer(1);

	timer.onTimer(4);
	cudaMemcpy(dA, A, sizeA * sizeof(DATA_TYEP), cudaMemcpyHostToDevice);
	cudaMemcpy(dB, B, sizeB * sizeof(DATA_TYEP), cudaMemcpyHostToDevice);
	timer.offTimer(4);

	// Grid covers m in x (rows) and n in y (cols), matching MatMul's mapping.
	// Integer ceil-division replaces the float ceil() round-trip.
	dim3 gridDim((m + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
	dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);

	printf("Grid(%d, %d), Block(%d, %d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);

	// GPU version
	timer.onTimer(2);
	MatMul<<<gridDim, blockDim>>>(dA, dB, dC, m, n, k);
	// Launch-configuration errors only surface via cudaGetLastError();
	// in-kernel execution errors surface at the synchronize below.
	cudaError_t launchErr = cudaGetLastError();
	if (launchErr != cudaSuccess)
		printf("Kernel launch failed: %s\n", cudaGetErrorString(launchErr));
	cudaDeviceSynchronize();
	timer.offTimer(2);

	timer.onTimer(5);
	cudaMemcpy(Cgpu, dC, sizeC * sizeof(DATA_TYEP), cudaMemcpyDeviceToHost);
	timer.offTimer(5);

	timer.offTimer(1);

	cudaFree(dA);
	cudaFree(dB);
	cudaFree(dC);

#ifdef DO_CPU
	printf("[Kernel basic] ");
	compareMatrix(Ccpu, Cgpu, sizeC);
#endif

	timer.printTimer(1);

	// BUGFIX: these buffers come from new[] (allocNinitMem), so array delete
	// is required; plain delete on a new[] pointer is undefined behavior.
	delete[] A;
	delete[] B;
	delete[] Ccpu;
	delete[] Cgpu;

	return 0;
}
135+
// Element-wise comparison of two _size-element matrices.
// Prints every mismatching index and returns true iff all elements are equal.
bool compareMatrix(DATA_TYEP* _A, DATA_TYEP* _B, int _size)
{
	bool isMatched = true;
	for (int i = 0; i < _size; i++) {
		if (_A[i] != _B[i]) {
			// BUGFIX: the original passed DATA_TYEP (int) arguments to %f,
			// which is undefined behavior. Casting to double keeps %f valid
			// no matter what arithmetic type DATA_TYEP is defined as.
			printf("[%d] not matched! (%f, %f)\n", i, (double)_A[i], (double)_B[i]);
			// BUGFIX: removed the per-mismatch getchar(), which stalled the
			// program waiting for input once per mismatching element.
			isMatched = false;
		}
	}
	if (isMatched)
		printf("Results are matched!\n");
	else
		printf("Results are not matched!!!!!!!!!!!\n");

	return isMatched;
}
153+
// Naive GEMM kernel: each thread computes one element of C = A * B, where
// A is m x k, B is k x n, and C is m x n (all row-major).
// Thread mapping: x-dimension -> row, y-dimension -> column, matching the
// host-side grid of (ceil(m/BLOCK_SIZE), ceil(n/BLOCK_SIZE)).
// NOTE(review): with this mapping, consecutive threads of a warp differ in
// row, so matA loads stride by k and are not coalesced; swapping the x/y
// roles (together with the host grid) should improve bandwidth — confirm
// with a profiler before changing.
__global__ void MatMul(DATA_TYEP* matA, DATA_TYEP* matB, DATA_TYEP* matC, int m, int n, int k)
{
	const int row = blockIdx.x * blockDim.x + threadIdx.x;
	const int col = blockIdx.y * blockDim.y + threadIdx.y;

	// Guard the grid tail: the launch rounds m and n up to BLOCK_SIZE.
	if (row < m && col < n) {
		DATA_TYEP acc = 0; // per-thread accumulator, kept in a register
		for (int i = 0; i < k; i++)
			acc += matA[ID2INDEX(row, i, k)] * matB[ID2INDEX(i, col, n)];
		matC[ID2INDEX(row, col, n)] = acc;
	}
}
168+
// Allocates `size` elements of T on the host, zero-fills the buffer, and,
// when memUsage is non-NULL, accumulates the allocated byte count into it.
// The caller owns the buffer and must release it with delete[].
template <class T>
void allocNinitMem(T** p, long long size, double* memUsage) {
	const size_t bytes = sizeof(T) * (size_t)size;

	*p = new T[size];
	memset(*p, 0, bytes);

	if (memUsage)
		*memUsage += (double)bytes;
}