Skip to content

Commit 3241aa1

Browse files
committed
Add 7_4
1 parent b06d9e8 commit 3241aa1

File tree

7 files changed

+1028
-0
lines changed

7 files changed

+1028
-0
lines changed

Book_BJ/Chap7/7_4_MatMul/7_4.cu

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#include "cuda_runtime.h"
2+
#include "device_launch_parameters.h"
3+
4+
#include "DS_timer.h"
5+
#include <stdio.h>
6+
#include <stdlib.h>
7+
#include <string.h>
8+
9+
#define DO_CPU
10+
#define DATA_TYEP int
11+
12+
#define SIZE_M (512*2)
13+
#define SIZE_N (512*4)
14+
#define SIZE_K (512*2)
15+
16+
#define INDEX2ROW(_index,_width) (int)((_index)/(_width))
17+
#define INDEX2COL(_index,_width) ((_index)%(_width))
18+
#define ID2INDEX(_row,_col, _width) (((_row)*(_width))+(_col))
19+
20+
#define BLOCK_SIZE 16
21+
22+
// Macro function
// Product of two operands as used inside the kernel. Both arguments are
// parenthesized so that compound expressions (e.g. a + b) are multiplied as a
// unit instead of being split by operator precedence.
//#define KERNEL_MUL(_a,_b) __fmul_rn(_a,_b)
#define KERNEL_MUL(_a,_b) ((_a)*(_b))
25+
26+
// kernel declarations
27+
__global__ void MatMul(DATA_TYEP* matA, DATA_TYEP* matB, DATA_TYEP* matC, int m, int n, int k);
28+
29+
template<class T> void allocNinitMem(T** p, long long size, double* memUsage = NULL);
30+
bool compareMatrix(DATA_TYEP* _A, DATA_TYEP* _B, int _size);
31+
32+
// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multiplies A (m x k) by B (k x n) on the CPU (only when DO_CPU is defined) and
 * on the GPU with the MatMul kernel, compares both results with compareMatrix
 * and finally calls timer.printTimer(1).
 *
 * Usage: <program> [m n k]
 *   The three sizes must be given together; otherwise SIZE_M, SIZE_N and SIZE_K
 *   are used. Returns 0 on success and 1 when the requested sizes are invalid.
 *   Any failing CUDA runtime call terminates the program through checkCuda.
 */
int main(int argc, char* argv[])
{
    DS_timer timer(10);
    timer.setTimerName(0, (char*)"CPU algorithm");
    timer.setTimerName(1, (char*)"GPU/CUDA algorithm");
    timer.setTimerName(2, (char*)" - Kernel");
    timer.setTimerName(4, (char*)" - [Data transter] host->device");
    timer.setTimerName(5, (char*)" - [Data transfer] device->host");

    // set matrix size
    int m, n, k;

    // argv[3] is read below, so all three size arguments have to be present.
    if (argc < 4) { m = SIZE_M; n = SIZE_N; k = SIZE_K; }
    else { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); }

    if (m <= 0 || n <= 0 || k <= 0) {
        fprintf(stderr, "Matrix sizes must be positive integers (m = %d, n = %d, k = %d)\n", m, n, k);
        return 1;
    }

    // Element counts and flat indices are plain int throughout this file, so
    // reject sizes whose products would not fit.
    const long long maxCount = 2147483647LL;
    if ((long long)m * k > maxCount || (long long)k * n > maxCount || (long long)m * n > maxCount) {
        fprintf(stderr, "Matrix sizes are too large (m = %d, n = %d, k = %d)\n", m, n, k);
        return 1;
    }

    printf("Size : A = (%d by %d), B = (%d by %d), C = (%d by %d)\n", m, k, k, n, m, n);

    int sizeA = m * k;
    int sizeB = k * n;
    int sizeC = m * n;

    // Make matrix
    DATA_TYEP* A = NULL, * B = NULL;
    allocNinitMem<DATA_TYEP>(&A, sizeA);
    allocNinitMem<DATA_TYEP>(&B, sizeB);

    DATA_TYEP* Ccpu = NULL, * Cgpu = NULL;
    allocNinitMem<DATA_TYEP>(&Ccpu, sizeC);
    allocNinitMem<DATA_TYEP>(&Cgpu, sizeC);

    // generate input matrices
    // (the value is converted to DATA_TYEP on assignment)
    for (int i = 0; i < sizeA; i++) A[i] = ((rand() % 10) + ((rand() % 100) / 100.0));
    for (int i = 0; i < sizeB; i++) B[i] = ((rand() % 10) + ((rand() % 100) / 100.0));

#ifdef DO_CPU // CPU version (OpenMP)
    timer.onTimer(0);
#pragma omp parallel for num_threads(4)
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            // Accumulate locally and store once per output element.
            DATA_TYEP sum = 0;
            for (int i = 0; i < k; i++)
                sum += (A[ID2INDEX(row, i, k)] * B[ID2INDEX(i, col, n)]);
            Ccpu[ID2INDEX(row, col, n)] = sum;
        }
    }
    printf("CPU finished!\n");
    timer.offTimer(0);
#endif

    // GPU setup
    DATA_TYEP* dA = NULL, * dB = NULL, * dC = NULL;

    checkCuda(cudaMalloc(&dA, sizeA * sizeof(DATA_TYEP)), "cudaMalloc dA");
    checkCuda(cudaMemset(dA, 0, sizeA * sizeof(DATA_TYEP)), "cudaMemset dA");

    checkCuda(cudaMalloc(&dB, sizeB * sizeof(DATA_TYEP)), "cudaMalloc dB");
    checkCuda(cudaMemset(dB, 0, sizeB * sizeof(DATA_TYEP)), "cudaMemset dB");

    checkCuda(cudaMalloc(&dC, sizeC * sizeof(DATA_TYEP)), "cudaMalloc dC");
    checkCuda(cudaMemset(dC, 0, sizeC * sizeof(DATA_TYEP)), "cudaMemset dC");

    // Timer 1 spans both transfers and the kernel.
    timer.onTimer(1);

    timer.onTimer(4);
    checkCuda(cudaMemcpy(dA, A, sizeA * sizeof(DATA_TYEP), cudaMemcpyHostToDevice), "cudaMemcpy A -> dA");
    checkCuda(cudaMemcpy(dB, B, sizeB * sizeof(DATA_TYEP), cudaMemcpyHostToDevice), "cudaMemcpy B -> dB");
    timer.offTimer(4);

    // The kernel takes the row from the x dimension and the column from y, so
    // x has to cover m and y has to cover n (integer ceil-division).
    dim3 grid((m + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    printf("Grid(%d, %d), Block(%d, %d)\n", grid.x, grid.y, block.x, block.y);

    // GPU version
    timer.onTimer(2);
    MatMul << < grid, block >> > (dA, dB, dC, m, n, k);
    checkCuda(cudaGetLastError(), "MatMul launch");          // invalid launch configuration
    checkCuda(cudaDeviceSynchronize(), "MatMul execution");  // faults raised while the kernel runs
    timer.offTimer(2);

    timer.onTimer(5);
    checkCuda(cudaMemcpy(Cgpu, dC, sizeC * sizeof(DATA_TYEP), cudaMemcpyDeviceToHost), "cudaMemcpy dC -> Cgpu");
    timer.offTimer(5);

    timer.offTimer(1);

    checkCuda(cudaFree(dA), "cudaFree dA");
    checkCuda(cudaFree(dB), "cudaFree dB");
    checkCuda(cudaFree(dC), "cudaFree dC");

#ifdef DO_CPU
    printf("[Kernel basic] ");
    compareMatrix(Ccpu, Cgpu, sizeC);
#endif

    timer.printTimer(1);

    // The buffers come from new T[] in allocNinitMem, so they need delete[].
    delete[] A;
    delete[] B;
    delete[] Ccpu;
    delete[] Cgpu;

    return 0;
}
135+
136+
/*
 * Compares two arrays element by element.
 *
 * _A, _B : arrays holding at least _size elements each
 * _size  : number of elements to compare
 *
 * Every mismatching element is printed and the function then waits for a
 * character on stdin (getchar) before continuing. A summary line is printed at
 * the end. Returns true when all _size elements are equal.
 */
bool compareMatrix(DATA_TYEP* _A, DATA_TYEP* _B, int _size)
{
    bool isMatched = true;
    for (int i = 0; i < _size; i++) {
        if (_A[i] != _B[i]) {
            // "%f" requires a double argument. DATA_TYEP may be an integer type,
            // and passing an int for "%f" is undefined behavior, so convert explicitly.
            printf("[%d] not matched! (%f, %f)\n", i, (double)_A[i], (double)_B[i]);
            getchar();
            isMatched = false;
        }
    }
    if (isMatched)
        printf("Results are matched!\n");
    else
        printf("Results are not matched!!!!!!!!!!!\n");

    return isMatched;
}
153+
154+
/*
 * Matrix multiplication kernel: each thread computes one element of matC.
 *
 * matA is indexed as ID2INDEX(row, i, k), matB as ID2INDEX(i, col, n) and matC
 * as ID2INDEX(row, col, n). The row index is taken from the x dimension of the
 * launch and the column index from the y dimension; threads falling outside
 * the m x n output do nothing.
 */
__global__ void MatMul(DATA_TYEP* matA, DATA_TYEP* matB, DATA_TYEP* matC, int m, int n, int k)
{
    const int r = blockDim.x * blockIdx.x + threadIdx.x;
    const int c = blockDim.y * blockIdx.y + threadIdx.y;

    // Only threads mapped onto a valid output element do any work.
    if (r < m && c < n) {
        DATA_TYEP acc = 0; // per-thread accumulator
        for (int step = 0; step < k; ++step) {
            acc += KERNEL_MUL(matA[ID2INDEX(r, step, k)], matB[ID2INDEX(step, c, n)]);
        }
        matC[ID2INDEX(r, c, n)] = acc;
    }
}
168+
169+
// Allocates `size` elements of T with new[], zero-fills the whole allocation
// and stores the resulting pointer in *p. When memUsage is not NULL, the number
// of allocated bytes is added to *memUsage.
template<class T>
void allocNinitMem(T** p, long long size, double* memUsage) {
    T* const block = new T[size];
    memset(block, 0, sizeof(T) * size);
    *p = block;

    if (memUsage == NULL)
        return;

    *memUsage += sizeof(T) * size;
}
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
#pragma once
2+
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <string.h>
6+
#include <iostream>
7+
8+
// Target OS selection: _TARGET_OS is OS_WINDOWS when _WIN32 is defined and
// OS_LINUX otherwise.
#define OS_WINDOWS 0
#define OS_LINUX 1

#ifdef _WIN32
    #define _TARGET_OS OS_WINDOWS
#else
    // nullptr is a keyword since C++11 and keywords are never "defined" as far as
    // the preprocessor is concerned, so a bare "#ifndef nullptr" is always true.
    // Only emulate nullptr with a macro when the language standard lacks it.
    #if !defined(nullptr) && (!defined(__cplusplus) || __cplusplus < 201103L)
        #define nullptr NULL
    #endif
    #define _TARGET_OS OS_LINUX
#endif
19+
20+
/************************************************************************/
21+
/* OS dependent functions */
22+
/************************************************************************/
23+
#if _TARGET_OS == OS_WINDOWS
24+
// #define _SPRINT sprintf_s
25+
#define _STRTOK strtok_s
26+
27+
#define EXIT_WIHT_KEYPRESS {std::cout << "Press any key to exit..."; getchar(); exit(0);}
28+
29+
#define SPLIT_PATH(_path,_result) \
30+
_splitpath_s(_path, _result.drive, 255, _result.dir, 255, _result.filename, 255, _result.ext, 255)
31+
32+
33+
#elif _TARGET_OS == OS_LINUX
34+
#include <libgen.h>
35+
#include <inttypes.h>
36+
37+
#define _STRTOK strtok_r
38+
39+
#define EXIT_WIHT_KEYPRESS {std::cout << "Program was terminated!"; exit(0);}
40+
41+
#define sprintf_s sprintf
42+
#define scanf_s scanf
43+
#define fprintf_s fprintf
44+
45+
#define __int64 int64_t
46+
47+
#define fopen_s(fp, name, mode) (*fp = fopen(name, mode))
48+
49+
#endif
50+
51+
/************************************************************************/
52+
/* Defines */
53+
/************************************************************************/
54+
55+
// *********** data size
56+
#define _1K_ 1024
57+
#define _1M_ (_1K_*_1K_)
58+
#define _1G_ (_1M_*_1K_)
59+
60+
#define CHAR_STRING_SIZE 255
61+
62+
/************************************************************************/
63+
/* Type definitions */
64+
/************************************************************************/
65+
typedef unsigned int UINT ;
66+
67+
/************************************************************************/
68+
/* Macro functions */
69+
/************************************************************************/
70+
#define DS_MEM_DELETE(a) \
71+
if (a != NULL) { \
72+
delete a ; \
73+
a = NULL ; \
74+
}
75+
76+
#define DS_MEM_DELETE_ARRAY(a) \
77+
if (a != NULL) { \
78+
delete [] a ; \
79+
a = NULL ; \
80+
}
81+
82+
#define RANGE_MIN 0
83+
#define RANGE_MAX 1
84+
85+
#define MATCHED_STRING 0
86+
87+
#ifndef VTK_RANGE_MIN
88+
#define VTK_RANGE_MIN 0
89+
#define VTK_RANGE_MAX 1
90+
#endif
91+
92+
// Print
93+
#define PRINT_LINE_INFO printf("%s, line %d", __FILE__, __LINE__)
94+
#define PRINT_ERROR_MSG(_msg) {PRINT_LINE_INFO; printf(" at "); printf(_msg);}
95+
96+
// Single loops
97+
#define LOOP_I(a) for(int i=0; i<a; i++)
98+
#define LOOP_J(a) for(int j=0; j<a; j++)
99+
#define LOOP_K(a) for(int k=0; k<a; k++)
100+
#define LOOP_INDEX(index, end) for (int index = 0 ; index < end ; index++)
101+
#define LOOP_INDEX_START_END(index, start, end) for (int index = start ; index < end ; index++)
102+
103+
// Multiple loops
104+
#define LOOP_J_I(b, a) LOOP_J(b) LOOP_I(a)
105+
#define LOOP_K_J_I(c,b,a) for(int k=0; k<c; k++) LOOP_J_I(b,a)
106+
107+
//
108+
// Exchanges the values of a and b through a temporary copy.
// The surrounding #ifndef only skips this definition when a macro named SWAP
// already exists.
#ifndef SWAP
template<class T>
void SWAP(T &a, T &b){
    T held = b;
    b = a;
    a = held;
}
#endif
116+
117+
//
118+
// Smaller / larger of two values.
// Every use of an argument is parenthesized so that operands containing
// lower-precedence operators (e.g. x & 1) are compared as a whole.
// Each argument may be evaluated twice, so avoid arguments with side effects.
#ifndef MIN
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
125+
126+
// Index converter
127+
128+
// Conversions between a flat index _ID and 2D coordinates (_X, _Y) for a row
// width of _W:
//   INDEX2X / INDEX2Y : coordinate of a flat index (remainder / quotient by _W)
//   INDEX2ID          : stores both coordinates of _ID into _X and _Y
//   ID2INDEX          : flat index of (_X, _Y), i.e. _Y * _W + _X
//   PTR2ID            : element distance between two pointers viewed as _type*
// All arguments are parenthesized so that compound expressions expand correctly.
#define INDEX2X(_ID,_W) ((_ID)%(_W))
#define INDEX2Y(_ID,_W) ((_ID)/(_W))
#define INDEX2ID(_ID,_X,_Y,_W) {(_X)=INDEX2X(_ID,_W);(_Y)=INDEX2Y(_ID,_W);}
#define ID2INDEX(_W,_X,_Y) ((_Y)*(_W)+(_X))
#define PTR2ID(_type, _target, _base) ((_type*)(_target) - (_type*)(_base))
133+
134+
// Memory allocation and release
135+
#ifndef SAFE_DELETE
136+
#define SAFE_DELETE(p) {if(p!=NULL) delete p; p=NULL;}
137+
#endif
138+
139+
#ifndef SAFE_DELETE_ARR
140+
#define SAFE_DELETE_ARR(p) {if(p!=NULL) delete [] p; p=NULL;}
141+
#endif
142+
143+
#define SAFE_NEW(p, type, size) {\
144+
try {p = new type[size];} \
145+
catch(std::bad_alloc& exc) \
146+
{ printf("[%s, line %d] fail to memory allocation - %.2f MB requested\n", __FILE__, __LINE__, (float)(sizeof(type)*size)/_1M_); \
147+
EXIT_WIHT_KEYPRESS }\
148+
}
149+
150+
// Zero-fills the first `size` elements of the buffer that *p points to.
// Nothing happens when *p is NULL.
template<class T>
void memsetZero(T** p, long long size = 0) {
    T* const target = *p;
    if (target == NULL)
        return;

    memset(target, 0, sizeof(T)*size);
}
155+
156+
// Allocates `size` elements of T with new[], clears the allocation to zero
// bytes and hands the pointer back through *p. If memUsage is given, the size
// of the allocation in bytes is added to *memUsage.
template<class T>
void allocNinitMem(T** p, long long size, double *memUsage = NULL) {
    // Plain new[] is used; the SAFE_NEW variant stays disabled.
    //SAFE_NEW(*p, T, size);
    T* const fresh = new T[size];
    memset(fresh, 0, sizeof(T)*size);
    *p = fresh;

    if (memUsage != NULL)
        *memUsage += sizeof(T)*size;
}
166+
167+
// Copies _size elements of type _type from _src to _dst with memcpy.
// When either pointer is null, a diagnostic with file and line is printed and
// the program terminates with exit(-1). The failure branch is braced so that
// exit(-1) runs only in that case; pointers are printed with %p, and the macro
// arguments are parenthesized so expressions such as `n + 1` expand correctly.
#define SAFE_MEMCPY(_dst, _src, _type, _size){ \
    if((_dst) == nullptr || (_src) == nullptr ) { \
        printf("[%s, line %d] fail to memcpy (dst = %p, src = %p)\n", __FILE__, __LINE__, (const void*)(_dst), (const void*)(_src)); \
        exit(-1); \
    } \
    memcpy((_dst), (_src), sizeof(_type)*(_size));\
}
173+
174+
// VTK related
175+
#ifndef SAFE_DELETE_VTK
176+
#define SAFE_DELETE_VTK(p) {if(p!=NULL) p->Delete(); p=NULL;}
177+
#endif
178+
179+
#ifndef VTK_IS_NOERROR
180+
//#include "DS_common_def.h"
181+
#define VTK_IS_NOERROR(p) (p->GetErrorCode()==vtkErrorCode::NoError ? true : false)
182+
#endif
183+
184+
/************************************************************************/
185+
/* Data structures */
186+
/************************************************************************/
187+
// Holds two strings: `input` and `output`.
struct nameMatch {
    std::string input;
    std::string output;
};
191+
192+
// Four fixed-size character buffers (255 bytes each) for the parts of a file
// path: drive, directory, file name and extension.
struct filePathSplit {
    char drive[255];
    char dir[255];
    char filename[255];
    char ext[255];
};

0 commit comments

Comments
 (0)