// V2_gpu1ThreadPerBlockOfMoments_slow.cu
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ising.h"
// CUDA kernel: performs one update step of the n x n lattice.
//
// For every cell (row, col) the kernel sums
//     weightArr[i*5 + j] * readArr[((row+i+n)%n)*n + (col+j+n)%n]
// over i, j in [-2, 2] (5x5 neighbourhood with wrap-around indexing) and
// stores into writeArr[row*n + col]:
//     -1                      if the sum is below -diff
//     +1                      if the sum is above  diff
//     readArr[row*n + col]    otherwise (cell keeps its current value)
//
// Parameters:
//   readArr   - input lattice, n*n values indexed as row*n + col
//   writeArr  - output lattice, same indexing; it has to be a different
//               buffer than readArr because neighbours are read while
//               other threads are writing their results
//   weightArr - pointer to the CENTRE element of a 5x5 weight window; the
//               kernel indexes it with offsets from -12 to +12
//   n         - side length of the lattice
//   tileSize  - side length of the square tile of cells one thread updates;
//               values below 1 are treated as 1
//
// Launch configuration: 2D grid of 2D blocks, x maps to rows and y to
// columns. Thread t (global index per dimension) starts at tile t and then
// advances by the total number of threads in that dimension (grid-stride
// over tiles), so every cell is updated exactly once for ANY grid size.
// No shared memory and no block-level barrier is used.
//
// NOTE: `diff` is not defined in this part of the file (presumably it comes
// from ising.h).
__global__ void computeMoment(int8_t *readArr, int8_t *writeArr, float *weightArr, int n, int tileSize){
    // A non-positive tile size would give a zero stride below.
    if (tileSize < 1) tileSize = 1;

    // First cell of the first tile owned by this thread, per dimension.
    int row_init = (blockIdx.x*blockDim.x + threadIdx.x)*tileSize;
    int col_init = (blockIdx.y*blockDim.y + threadIdx.y)*tileSize;

    // Distance (in cells) between two consecutive tiles of the same thread.
    int rowStride = gridDim.x*blockDim.x*tileSize;
    int colStride = gridDim.y*blockDim.y*tileSize;

    for (int tileRow = row_init; tileRow < n; tileRow += rowStride){
        for (int tileCol = col_init; tileCol < n; tileCol += colStride){
            // Walk the tileSize x tileSize tile, clipped at the lattice edge.
            for (int ii = 0; ii < tileSize; ++ii){
                int row = tileRow + ii;
                if (row >= n) break;

                for (int jj = 0; jj < tileSize; ++jj){
                    int col = tileCol + jj;
                    if (col >= n) break;

                    float influence = 0.0f;
                    for (int i = -2; i < 3; i++){
                        // Add n before the modulo so the result is never
                        // negative; the row index does not depend on j.
                        int y = (row + i + n)%n;
                        for (int j = -2; j < 3; j++){
                            int x = (col + j + n)%n;
                            influence += weightArr[i*5 + j]*readArr[(size_t)y*n + x];
                        }
                    }

                    // Default: keep the current value; flip only when the
                    // weighted sum is clearly negative or positive.
                    size_t idx = (size_t)row*n + col;
                    int8_t spin = readArr[idx];
                    if (influence<-diff) spin = -1;
                    else if (influence>diff) spin = 1;
                    writeArr[idx] = spin;
                }
            }
        }
    }
}

// Runs k update steps of the n x n lattice G on the GPU using the 5x5
// weight matrix w, and stores the final lattice back into G.
//
// Parameters:
//   G - host array of n*n values (row-major); overwritten with the result
//   w - host array of 5*5 weights; it is copied, not modified
//   k - number of update steps
//   n - side length of the lattice
//
// If n <= 0 or k <= 0 the function returns without touching G. If any CUDA
// call fails, a message is written to stderr and G is left unchanged.
void ising(int8_t *G, float *w, int k, int n)
{
    if (n <= 0 || k <= 0) return;

    // Do the size arithmetic in size_t so that n*n cannot overflow int.
    const size_t latticeBytes = (size_t)n*(size_t)n*sizeof(int8_t);
    const size_t weightBytes  = 5*5*sizeof(float);

    // Managed allocations: the buffers are filled on the host and then
    // read/written inside the kernel.
    int8_t *readArr = NULL, *writeArr = NULL;
    float *weightBase = NULL;   // start of the allocation; this is what gets freed

    cudaError_t err = cudaMallocManaged(&readArr, latticeBytes);
    if (err == cudaSuccess) err = cudaMallocManaged(&writeArr, latticeBytes);
    if (err == cudaSuccess) err = cudaMallocManaged(&weightBase, weightBytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "ising: cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        // cudaFree on a null pointer performs no operation.
        cudaFree(readArr);
        cudaFree(writeArr);
        cudaFree(weightBase);
        return;
    }

    // Copy the inputs into the buffers used by the kernel.
    memcpy(readArr, G, latticeBytes);
    memcpy(weightBase, w, weightBytes);

    // Point at the centre of the 5x5 window so that valid indexes are
    // [-2..2][-2..2], and zero the centre weight.
    float *weightArr_d = weightBase + (2*5 + 2);
    weightArr_d[0] = 0.0f;

    // Each thread updates a tileSize x tileSize tile of cells.
    const int tileSize = 5;

    // 32x32 threads per block; one block therefore spans 32*tileSize cells
    // per dimension. Round the block count up so the whole lattice is covered.
    const int threadsPerDim = 32;
    const int cellsPerBlockDim = threadsPerDim*tileSize;
    const int gridSz = n/cellsPerBlockDim + (n%cellsPerBlockDim != 0 ? 1 : 0);
    const dim3 dimBlock(threadsPerDim, threadsPerDim);
    const dim3 dimGrid(gridSz, gridSz);

    for (int i=1; i<=k; i++)
    {
        computeMoment<<<dimGrid, dimBlock>>> (readArr, writeArr, weightArr_d, n, tileSize);

        // Launch-configuration errors are reported here.
        err = cudaGetLastError();
        if (err != cudaSuccess) break;

        // Swap read and write arrays. Launches on the same stream execute
        // in order, so no host-side wait is needed between steps.
        int8_t *temp = readArr;
        readArr = writeArr;
        writeArr = temp;
    }

    // Wait for the GPU to finish before the host reads the result; errors
    // raised while the kernels were running surface here.
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    if (err == cudaSuccess) {
        // After the last swap the final result is in readArr.
        memcpy(G, readArr, latticeBytes);
    } else {
        fprintf(stderr, "ising: kernel execution failed: %s\n", cudaGetErrorString(err));
    }

    cudaFree( readArr );
    cudaFree( writeArr );
    cudaFree( weightBase );
}