Skip to content

Commit bb4d6d0

Browse files
committed
Add 11-1
1 parent 4caa1e0 commit bb4d6d0

File tree

6 files changed

+866
-0
lines changed

6 files changed

+866
-0
lines changed
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#include "cuda_runtime.h"
2+
#include "device_launch_parameters.h"
3+
4+
#include "DS_timer.h"
5+
#include <stdio.h>
6+
#include <stdlib.h>
7+
#include <string.h>
8+
9+
#define SIZE_M (512*2)
10+
#define SIZE_N (512*4)
11+
#define SIZE_K (512*2)
12+
13+
#define BLOCK_SIZE 16
14+
15+
// kernels
16+
// Integer matrix multiply C = A * B, one output element per thread.
// Thread mapping: the x index (blockIdx.x/threadIdx.x) selects the row of C and
// the y index selects the column.
//   matA : m x k, read as matA[row * k + i]
//   matB : k x n, read as matB[i * n + col]
//   matC : m x n, written as matC[row * n + col]
// Threads that fall outside the m x n output return without touching memory.
__global__ void MatMul_xRow(int* matA, int* matB, int* matC, int m, int n, int k)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int col = blockDim.y * blockIdx.y + threadIdx.y;

    if (row >= m || col >= n)
        return;

    int val = 0;
    for (int i = 0; i < k; i++)
        val += matA[row * k + i] * matB[i * n + col];

    // Fix: the subscript used to be `row * n + col, n`. The comma operator
    // discards the left operand, so every thread stored into matC[n].
    matC[row * n + col] = val;
}
30+
31+
// Integer matrix multiply C = A * B, one output element per thread.
// Thread mapping: the y index (blockIdx.y/threadIdx.y) selects the row of C and
// the x index selects the column.
//   matA : m x k, read as matA[row * k + i]
//   matB : k x n, read as matB[i * n + col]
//   matC : m x n, written as matC[row * n + col]
// Threads that fall outside the m x n output return without touching memory.
__global__ void MatMul_yRow(int* matA, int* matB, int* matC, int m, int n, int k)
{
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    if (row >= m || col >= n)
        return;

    int val = 0;
    for (int i = 0; i < k; i++)
        val += matA[row * k + i] * matB[i * n + col];

    // Fix: the subscript used to be `row * n + col, n`. The comma operator
    // discards the left operand, so every thread stored into matC[n].
    matC[row * n + col] = val;
}
45+
46+
template<class T> void allocNinitMem(T** p, long long size, double* memUsage = NULL);
47+
bool compareMatrix(int* _A, int* _B, int _size);
48+
49+
// Stops the program with a diagnostic when a CUDA runtime call reports an error.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Integer ceiling division used to size the launch grids.
static unsigned int ceilDiv(int total, int chunk)
{
    return (unsigned int)((total + chunk - 1) / chunk);
}

// Times the two matrix-multiply kernels (row mapped to x vs. row mapped to y)
// on randomly generated integer matrices.
// Usage: <program> [m n k]   -- A is m x k, B is k x n, C is m x n.
// Without all three sizes the SIZE_M / SIZE_N / SIZE_K defaults are used.
int main(int argc, char* argv[])
{
    DS_timer timer(10);
    timer.setTimerName(1, (char*)" - X-dim = Row Kernel");
    timer.setTimerName(2, (char*)" - Y-dim = Row Kernel");

    // set matrix size
    int m, n, k;

    // argv[3] is read below, so at least four entries are required
    // (the previous `argc < 3` test let argv[3] be read when it was absent).
    if (argc < 4) { m = SIZE_M; n = SIZE_N; k = SIZE_K; }
    else { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); }

    if (m <= 0 || n <= 0 || k <= 0) {
        fprintf(stderr, "Matrix sizes must be positive integers: m n k\n");
        return 1;
    }

    printf("Size : A = (%d by %d), B = (%d by %d), C = (%d by %d)\n", m, k, k, n, m, n);

    int sizeA = m * k;
    int sizeB = k * n;
    int sizeC = m * n;

    // Make matrix
    int* A = NULL, * B = NULL;
    allocNinitMem<int>(&A, sizeA);
    allocNinitMem<int>(&B, sizeB);

    // NOTE(review): Ccpu is allocated but no CPU reference result is computed or compared here.
    int* Ccpu = NULL, * Cgpu = NULL;
    allocNinitMem<int>(&Ccpu, sizeC);
    allocNinitMem<int>(&Cgpu, sizeC);

    // generate input matrices (the fractional part is truncated by the store into int)
    for (int i = 0; i < sizeA; i++) A[i] = ((rand() % 10) + ((rand() % 100) / 100.0));
    for (int i = 0; i < sizeB; i++) B[i] = ((rand() % 10) + ((rand() % 100) / 100.0));

    // GPU setup
    int* dA = NULL, * dB = NULL, * dC = NULL;

    checkCuda(cudaMalloc(&dA, sizeA * sizeof(int)), "cudaMalloc(dA)");
    checkCuda(cudaMalloc(&dB, sizeB * sizeof(int)), "cudaMalloc(dB)");
    checkCuda(cudaMalloc(&dC, sizeC * sizeof(int)), "cudaMalloc(dC)");
    checkCuda(cudaMemset(dC, 0, sizeC * sizeof(int)), "cudaMemset(dC)");

    // dA and dB are fully overwritten by these copies, so they need no memset.
    checkCuda(cudaMemcpy(dA, A, sizeA * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(A -> dA)");
    checkCuda(cudaMemcpy(dB, B, sizeB * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(B -> dB)");

    // Row = X-dim version
    timer.onTimer(1);
    dim3 gridDim_xRow(ceilDiv(m, BLOCK_SIZE), ceilDiv(n, BLOCK_SIZE));
    dim3 blockDim_xRow(BLOCK_SIZE, BLOCK_SIZE);
    MatMul_xRow<<<gridDim_xRow, blockDim_xRow>>>(dA, dB, dC, m, n, k);
    checkCuda(cudaGetLastError(), "MatMul_xRow launch");
    checkCuda(cudaDeviceSynchronize(), "MatMul_xRow execution");
    timer.offTimer(1);

    // Row = Y-dim version (writes the same dC buffer as the first kernel)
    timer.onTimer(2);
    dim3 gridDim_yRow(ceilDiv(n, BLOCK_SIZE), ceilDiv(m, BLOCK_SIZE));
    dim3 blockDim_yRow(BLOCK_SIZE, BLOCK_SIZE);
    MatMul_yRow<<<gridDim_yRow, blockDim_yRow>>>(dA, dB, dC, m, n, k);
    checkCuda(cudaGetLastError(), "MatMul_yRow launch");
    checkCuda(cudaDeviceSynchronize(), "MatMul_yRow execution");
    timer.offTimer(2);

    checkCuda(cudaMemcpy(Cgpu, dC, sizeC * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> Cgpu)");

    checkCuda(cudaFree(dA), "cudaFree(dA)");
    checkCuda(cudaFree(dB), "cudaFree(dB)");
    checkCuda(cudaFree(dC), "cudaFree(dC)");

    timer.printTimer(1);

    // The host buffers come from new[] (see allocNinitMem), so release them with delete[].
    delete[] A;
    delete[] B;
    delete[] Ccpu;
    delete[] Cgpu;

    return 0;
}
126+
127+
// Allocates `size` elements of T with new[], zero-fills the storage with memset
// and hands the pointer back through *p. When memUsage is non-NULL the number
// of bytes allocated is added to *memUsage. Release the array with delete[].
template<class T>
void allocNinitMem(T** p, long long size, double* memUsage) {
    T* fresh = new T[size];
    memset(fresh, 0, sizeof(T) * size);
    *p = fresh;

    if (memUsage == NULL)
        return;

    *memUsage += sizeof(T) * size;
}
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
#pragma once
2+
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <string.h>
6+
#include <iostream>
7+
8+
// Target-OS selection: _TARGET_OS is OS_WINDOWS when _WIN32 is defined, OS_LINUX otherwise.
#define OS_WINDOWS 0
#define OS_LINUX 1

#ifdef _WIN32
#define _TARGET_OS OS_WINDOWS
#else
// nullptr is a keyword from C++11 on, so `#ifndef nullptr` alone is always true
// and would replace the keyword with NULL. Only provide the fallback for
// compilers that predate C++11 (or for C).
#if !defined(nullptr) && (!defined(__cplusplus) || __cplusplus < 201103L)
#define nullptr NULL
#endif
#define _TARGET_OS OS_LINUX
#endif
19+
20+
/************************************************************************/
21+
/* OS-dependent functions                                               */
22+
/************************************************************************/
23+
#if _TARGET_OS == OS_WINDOWS
24+
// #define _SPRINT sprintf_s
25+
#define _STRTOK strtok_s
26+
27+
#define EXIT_WIHT_KEYPRESS {std::cout << "Press any key to exit..."; getchar(); exit(0);}
28+
29+
#define SPLIT_PATH(_path,_result) \
30+
_splitpath_s(_path, _result.drive, 255, _result.dir, 255, _result.filename, 255, _result.ext, 255)
31+
32+
33+
#elif _TARGET_OS == OS_LINUX
34+
#include <libgen.h>
35+
#include <inttypes.h>
36+
37+
#define _STRTOK strtok_r
38+
39+
#define EXIT_WIHT_KEYPRESS {std::cout << "Program was terminated!"; exit(0);}
40+
41+
#define sprintf_s sprintf
42+
#define scanf_s scanf
43+
#define fprintf_s fprintf
44+
45+
#define __int64 int64_t
46+
47+
// Shim for MSVC's fopen_s on top of fopen: stores the stream in *fp and
// evaluates to 0 on success and non-zero on failure, matching the
// 0-on-success convention of the real fopen_s (the previous form evaluated
// to the FILE*, i.e. non-zero on success).
#define fopen_s(fp, name, mode) ((*(fp) = fopen((name), (mode))) == NULL)
48+
49+
#endif
50+
51+
/************************************************************************/
52+
/* Defines */
53+
/************************************************************************/
54+
55+
// *********** data size
56+
#define _1K_ 1024
57+
#define _1M_ (_1K_*_1K_)
58+
#define _1G_ (_1M_*_1K_)
59+
60+
#define CHAR_STRING_SIZE 255
61+
62+
/************************************************************************/
63+
/* Type definitions */
64+
/************************************************************************/
65+
typedef unsigned int UINT ;
66+
67+
/************************************************************************/
68+
/* Macro functions */
69+
/************************************************************************/
70+
// Releases the single object `a` points to (scalar delete) and resets `a` to NULL.
// `a` is evaluated more than once: pass a pointer lvalue without side effects.
// The argument is parenthesized so expression arguments bind as intended.
#define DS_MEM_DELETE(a) \
    if ((a) != NULL) { \
        delete (a) ; \
        (a) = NULL ; \
    }
75+
76+
// Releases the array `a` points to (delete[]) and resets `a` to NULL.
// `a` is evaluated more than once: pass a pointer lvalue without side effects.
// The argument is parenthesized so expression arguments bind as intended.
#define DS_MEM_DELETE_ARRAY(a) \
    if ((a) != NULL) { \
        delete [] (a) ; \
        (a) = NULL ; \
    }
81+
82+
#define RANGE_MIN 0
83+
#define RANGE_MAX 1
84+
85+
#define MATCHED_STRING 0
86+
87+
#ifndef VTK_RANGE_MIN
88+
#define VTK_RANGE_MIN 0
89+
#define VTK_RANGE_MAX 1
90+
#endif
91+
92+
// Print
93+
// Prints "<file>, line <n>" for the expansion site.
#define PRINT_LINE_INFO printf("%s, line %d", __FILE__, __LINE__)
// Prints the line info followed by " at " and the message. The message is
// passed as data ("%s") rather than as the format string, so '%' characters
// in it are printed literally instead of being interpreted by printf.
#define PRINT_ERROR_MSG(_msg) {PRINT_LINE_INFO; printf(" at "); printf("%s", (_msg));}
95+
96+
// Single loops
97+
// Counting-loop headers. Each macro expands to a `for` header that declares
// its own int index (i, j, k or the given name) running from 0 (or `start`)
// up to, but not including, the bound. Bounds are parenthesized so that
// expression arguments such as `n & 3` are compared as a whole.
#define LOOP_I(a) for(int i=0; i<(a); i++)
#define LOOP_J(a) for(int j=0; j<(a); j++)
#define LOOP_K(a) for(int k=0; k<(a); k++)
#define LOOP_INDEX(index, end) for (int index = 0 ; index < (end) ; index++)
#define LOOP_INDEX_START_END(index, start, end) for (int index = (start) ; index < (end) ; index++)

// Multiple loops: nested headers, outermost index first (k, then j, then i).
#define LOOP_J_I(b, a) LOOP_J(b) LOOP_I(a)
#define LOOP_K_J_I(c,b,a) for(int k=0; k<(c); k++) LOOP_J_I(b,a)
106+
107+
//
108+
#ifndef SWAP
109+
// Exchanges the values of a and b through one temporary copy.
template<class T>
void SWAP(T &a, T &b){
    const T first = a;
    a = b;
    b = first;
}
115+
#endif
116+
117+
//
118+
#ifndef MIN
// Smaller of a and b. Arguments are parenthesized so operator expressions
// (e.g. `x | 1`) are compared as a whole; each argument may still be
// evaluated twice, so avoid side effects.
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif
121+
122+
#ifndef MAX
// Larger of a and b. Arguments are parenthesized so operator expressions
// (e.g. `x | 1`) are compared as a whole; each argument may still be
// evaluated twice, so avoid side effects.
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
125+
126+
// Index converter
127+
128+
// Conversions between a flat index and (x, y) coordinates for a row width _W.
// All arguments are parenthesized so expression arguments expand correctly.
#define INDEX2X(_ID,_W) ((_ID)%(_W))
#define INDEX2Y(_ID,_W) ((_ID)/(_W))
// Stores both coordinates of _ID into _X and _Y.
// Fix: the y computation referenced the undefined name `_ID_` instead of `_ID`.
#define INDEX2ID(_ID,_X,_Y,_W) {(_X)=INDEX2X(_ID,_W);(_Y)=INDEX2Y(_ID,_W);}
#define ID2INDEX(_W,_X,_Y) ((_Y)*(_W)+(_X))
// Element distance between two pointers, both viewed as _type*.
#define PTR2ID(_type, _target, _base) ((_type*)(_target) - (_type*)(_base))
133+
134+
// Memory allocation and release
135+
#ifndef SAFE_DELETE
// Scalar-deletes the object p points to (if any) and sets p to NULL.
// p is evaluated more than once; it is parenthesized so expression
// arguments bind as intended.
#define SAFE_DELETE(p) {if((p)!=NULL) delete (p); (p)=NULL;}
#endif
138+
139+
#ifndef SAFE_DELETE_ARR
// Array-deletes (delete[]) the storage p points to (if any) and sets p to NULL.
// p is evaluated more than once; it is parenthesized so expression
// arguments bind as intended.
#define SAFE_DELETE_ARR(p) {if((p)!=NULL) delete [] (p); (p)=NULL;}
#endif
142+
143+
// Allocates `size` elements of `type` with new[] into p. If the allocation
// throws std::bad_alloc, prints the file, line and requested size in MB and
// then runs EXIT_WIHT_KEYPRESS.
// `size` is parenthesized so an expression such as `n + 1` is multiplied as a
// whole when the requested size is reported.
#define SAFE_NEW(p, type, size) {\
    try {(p) = new type[(size)];} \
    catch(const std::bad_alloc&) \
    { printf("[%s, line %d] fail to memory allocation - %.2f MB requested\n", __FILE__, __LINE__, (float)(sizeof(type)*(size))/_1M_); \
    EXIT_WIHT_KEYPRESS }\
}
149+
150+
// Zero-fills `size` elements of the array *p with memset.
// Does nothing when *p is NULL. With the default size of 0 no bytes are written.
template<class T>
void memsetZero(T** p, long long size = 0) {
    T* target = *p;
    if (target == NULL)
        return;

    memset(target, 0, sizeof(T)*size);
}
155+
156+
// Allocates `size` elements of T with new[] (the SAFE_NEW variant is not used),
// zero-fills the storage with memset and hands the pointer back through *p.
// When memUsage is non-NULL the number of bytes allocated is added to *memUsage.
// Release the array with delete[].
template<class T>
void allocNinitMem(T** p, long long size, double *memUsage = NULL) {
    T* fresh = new T[size];
    memset(fresh, 0, sizeof(T)*size);
    *p = fresh;

    if (memUsage == NULL)
        return;

    *memUsage += sizeof(T)*size;
}
166+
167+
// Copies _size elements of _type from _src to _dst with memcpy.
// If either pointer is null, prints the file, line and both pointers and
// terminates with exit(-1).
// Fix: the null check had no braces, so exit(-1) ran unconditionally and the
// memcpy was never reached. Pointers are now printed with %p instead of %x.
#define SAFE_MEMCPY(_dst, _src, _type, _size){ \
    if((_dst) == nullptr || (_src) == nullptr ) { \
        printf("[%s, line %d] fail to memcpy (dst = %p, src = %p)\n", __FILE__, __LINE__, (const void*)(_dst), (const void*)(_src)); \
        exit(-1); \
    } \
    memcpy((_dst), (_src), sizeof(_type)*(_size));\
}
173+
174+
// VTK related
175+
#ifndef SAFE_DELETE_VTK
// Calls p->Delete() when p is non-NULL, then sets p to NULL.
// p is parenthesized so that an expression argument (e.g. `*slot`) is
// dereferenced before `->Delete()` is applied.
#define SAFE_DELETE_VTK(p) {if((p)!=NULL) (p)->Delete(); (p)=NULL;}
#endif
178+
179+
#ifndef VTK_IS_NOERROR
//#include "DS_common_def.h"
// True when p->GetErrorCode() equals vtkErrorCode::NoError.
// p is parenthesized so expression arguments bind before `->`; the comparison
// already yields a bool, so the `? true : false` wrapper was dropped.
#define VTK_IS_NOERROR(p) ((p)->GetErrorCode()==vtkErrorCode::NoError)
#endif
183+
184+
/************************************************************************/
185+
/* Data structures */
186+
/************************************************************************/
187+
// Pair of strings named `input` and `output`.
// NOTE(review): presumably associates an input name with its output name; confirm against the code that uses it.
typedef struct {
188+
std::string input;
189+
std::string output;
190+
} nameMatch;
191+
192+
// Components of a file path: drive, directory, file name and extension.
// Each field is a 255-char buffer, the same size the SPLIT_PATH macro passes
// to _splitpath_s when it fills these four members.
typedef struct {
193+
char drive[255];
194+
char dir[255];
195+
char filename[255];
196+
char ext[255];
197+
} filePathSplit;

0 commit comments

Comments
 (0)