Skip to content

Commit 47bd9f1

Browse files
Add Cuda implementation
1 parent fd96471 commit 47bd9f1

File tree

4 files changed

+332
-98
lines changed

4 files changed

+332
-98
lines changed

CMakeLists.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
cmake_minimum_required(VERSION 3.30)
2-
project(integral_image_prova)
1+
# find_package(CUDAToolkit) relies on the FindCUDAToolkit module (CMake 3.17+),
# and selecting CUDA architectures from CMake requires 3.18+.
cmake_minimum_required(VERSION 3.18)
2+
# Declare the project and enable both the C++ and CUDA compilers.
project(integral_image LANGUAGES CXX CUDA)
33

44
# Request C++20 for C++ sources.
# NOTE(review): this does not set CMAKE_CUDA_STANDARD; confirm the standard wanted for .cu files.
set(CMAKE_CXX_STANDARD 20)
55

6-
add_executable(integral_image_prova main.cpp)
6+
# Locate the CUDA toolkit; configuration fails if it cannot be found (REQUIRED).
find_package(CUDAToolkit REQUIRED)
7+
# Target compute capability 8.6. CMAKE_CUDA_ARCHITECTURES (not a plain
# CUDA_ARCHITECTURES variable) is what initialises the CUDA_ARCHITECTURES
# property of targets created after this point.
set(CMAKE_CUDA_ARCHITECTURES 86)
8+
9+
# Build the integral_image executable from main.cu.
# NOTE(review): kernel.cu is not listed as a source here; presumably main.cu
# includes it directly. Confirm.
add_executable(integral_image main.cu)

kernel.cu

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
// Side length, in threads, of the square tile processed by each Transpose block.
#define BLOCK_DIM 32
2+
// Number of elements in the shared-memory scan buffer of SinglePassKoggeStoneScan.
#define SECTION_SIZE 1024
3+
4+
/*
 * Single-pass inclusive prefix sum (Kogge-Stone scan inside each block,
 * running totals passed from block to block through global memory).
 *
 * Launch layout: 1D grid of 1D blocks, blockDim.x <= SECTION_SIZE, with
 * gridDim.x * blockDim.x >= length.
 *
 * Parameters:
 *   input        - `length` values to scan.
 *   output       - receives the `length` inclusive prefix sums.
 *   length       - number of valid elements in input/output.
 *   flags        - gridDim.x + 1 entries, all zero before launch; flags[b]
 *                  becomes non-zero once scanValue[b] has been published.
 *   scanValue    - gridDim.x + 1 entries; scanValue[0] is read by the first
 *                  block without waiting and is added to every output, so it
 *                  must be 0 for a plain prefix sum. scanValue[b + 1] receives
 *                  the running total up to and including block b.
 *   blockCounter - single counter, zero before launch, used to hand out
 *                  block indices in the order blocks actually start running.
 */
__global__ void
SinglePassKoggeStoneScan(const unsigned int* input,
                         unsigned int* output,
                         const unsigned int length,
                         unsigned int* flags,
                         unsigned int* scanValue,
                         unsigned int* blockCounter)
{
    __shared__ unsigned int bid_s;
    __shared__ unsigned int previousSum;
    __shared__ unsigned int XY[SECTION_SIZE];

    // Take a dynamic block index instead of blockIdx.x: a block that waits on
    // its predecessor is then guaranteed that the predecessor already started.
    if (threadIdx.x == 0)
    {
        bid_s = atomicAdd(blockCounter, 1u);
    }
    __syncthreads();

    const unsigned int bid = bid_s;
    // 64-bit index so the tail block cannot wrap around for very large inputs.
    const size_t idx = static_cast<size_t>(bid) * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute the additive identity.
    XY[threadIdx.x] = (idx < length) ? input[idx] : 0u;

    // Kogge-Stone inclusive scan over this block's section. The partial sum is
    // kept in an unsigned int: a float temporary would round sums above 2^24.
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
        __syncthreads(); // previous writes (or the initial load) are visible
        unsigned int tmp = 0;
        if (threadIdx.x >= stride)
        {
            tmp = XY[threadIdx.x] + XY[threadIdx.x - stride];
        }
        __syncthreads(); // every read is done before anyone overwrites
        if (threadIdx.x >= stride)
        {
            XY[threadIdx.x] = tmp;
        }
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        // Wait until the previous block has published its running total.
        while (bid >= 1 && atomicAdd(&flags[bid], 0u) == 0)
        {
            // spin
        }
        // Order the flag observation before the data read, and bypass any
        // cached copy of scanValue[bid] by reading through a volatile pointer.
        __threadfence();
        previousSum = *reinterpret_cast<volatile unsigned int*>(&scanValue[bid]);

        // Publish this block's running total, then raise the successor's flag.
        scanValue[bid + 1] = XY[blockDim.x - 1] + previousSum;
        __threadfence(); // data must be visible before the flag is
        atomicAdd(&flags[bid + 1], 1u);
    }
    __syncthreads();

    if (idx < length)
    {
        output[idx] = XY[threadIdx.x] + previousSum;
    }
}
69+
70+
/*
 * Tiled matrix transpose of a row-major height x width matrix of unsigned
 * ints into a row-major width x height matrix.
 *
 * Launch layout: 2D blocks of BLOCK_DIM x BLOCK_DIM threads; the grid must
 * cover the input with blockIdx.x along the width and blockIdx.y along the
 * height.
 *
 * Parameters:
 *   input  - height * width source elements, row-major.
 *   output - width * height destination elements, row-major.
 *   height - number of rows of input.
 *   width  - number of columns of input.
 */
__global__ void
Transpose(const unsigned int* input,
          unsigned int* output,
          const unsigned int height,
          const unsigned int width)
{
    // The tile has the same type as the data: staging unsigned ints through a
    // float tile would corrupt every value above 2^24. The extra column of
    // padding keeps the column-wise reads below free of bank conflicts.
    __shared__ unsigned int block[BLOCK_DIM][BLOCK_DIM + 1];

    // Load one tile of the input, row by row.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
        // 64-bit linear index so large matrices do not overflow 32 bits.
        const size_t index_in = static_cast<size_t>(yIndex) * width + xIndex;
        block[threadIdx.y][threadIdx.x] = input[index_in];
    }

    __syncthreads();

    // Write the tile back with block coordinates swapped; reading the tile
    // transposed keeps consecutive threads on consecutive output addresses.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
    {
        const size_t index_out = static_cast<size_t>(yIndex) * height + xIndex;
        output[index_out] = block[threadIdx.x][threadIdx.y];
    }
}

main.cpp

Lines changed: 0 additions & 95 deletions
This file was deleted.

0 commit comments

Comments
 (0)