1
// Edge length, in elements, of the square shared-memory tile used by Transpose.
#define BLOCK_DIM 32
// Capacity, in elements, of the shared scratch array used by SinglePassKoggeStoneScan.
#define SECTION_SIZE 1024
3
+
4
/**
 * Single-pass inclusive prefix sum over `input`, written to `output`.
 *
 * Each block scans one contiguous section of blockDim.x elements in shared
 * memory with the Kogge-Stone pattern, then waits for the running total of
 * all preceding sections, adds it to its own results and publishes the
 * updated running total for the next block.
 *
 * Launch / memory preconditions:
 *   - 1D blocks with blockDim.x <= SECTION_SIZE (XY is indexed by threadIdx.x).
 *   - `flags` and `scanValue` must hold at least (number of blocks + 1)
 *     entries, because block `bid` writes index `bid + 1`.
 *   - NOTE(review): `blockCounter` and `flags` are presumably zeroed by the
 *     host before launch, and `scanValue[0]` presumably holds 0 (it is used
 *     as the offset of the first block) - confirm against the caller.
 *
 * @param input        values to scan (read for idx < length only)
 * @param output       inclusive prefix sums (written for idx < length only)
 * @param length       number of elements in `input` / `output`
 * @param flags        per-block "running total ready" flags
 * @param scanValue    per-block running totals; entry b is the sum of all
 *                     sections before block b
 * @param blockCounter global counter used to hand out dynamic block ids
 */
__global__ void
SinglePassKoggeStoneScan(const unsigned int *input,
                         unsigned int *output,
                         const unsigned int length,
                         unsigned int *flags,
                         unsigned int *scanValue,
                         unsigned int *blockCounter)
{
    __shared__ unsigned int bid_s;
    __shared__ unsigned int previousSum;
    __shared__ unsigned int XY[SECTION_SIZE];

    // Dynamic block id: ids are handed out in the order blocks actually start
    // executing, so the block we later wait on has already been scheduled.
    if (threadIdx.x == 0)
    {
        bid_s = atomicAdd(blockCounter, 1);
    }
    __syncthreads();

    const unsigned int bid = bid_s;
    const unsigned int idx = bid * blockDim.x + threadIdx.x;

    // Load this block's section; pad the tail of the last section with zeros.
    XY[threadIdx.x] = (idx < length) ? input[idx] : 0u;

    // Kogge-Stone inclusive scan. Strides >= blockDim.x can never satisfy
    // threadIdx.x >= stride, so the loop stops at blockDim.x instead of
    // SECTION_SIZE. The bound is uniform across the block, so every thread
    // reaches every barrier.
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
        __syncthreads();
        // Keep the partial sum in unsigned int: a float temporary would round
        // any value above 2^24 and corrupt the result.
        unsigned int tmp = 0;
        if (threadIdx.x >= stride)
        {
            tmp = XY[threadIdx.x] + XY[threadIdx.x - stride];
        }
        __syncthreads();
        if (threadIdx.x >= stride)
        {
            XY[threadIdx.x] = tmp;
        }
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        if (bid > 0)
        {
            while (atomicAdd(&flags[bid], 0) == 0)
            {
                // Wait for the predecessor's running total.
            }
            // Order the flag observation before the read of scanValue[bid].
            __threadfence();
        }
        // Volatile read so the value published by the other block is actually
        // fetched from memory rather than served from a stale cached copy.
        previousSum = *((volatile unsigned int *)&scanValue[bid]);
        scanValue[bid + 1] = XY[blockDim.x - 1] + previousSum;
        // Make the new running total visible before raising the flag.
        __threadfence();
        atomicAdd(&flags[bid + 1], 1);
    }
    __syncthreads();

    if (idx < length)
    {
        output[idx] = XY[threadIdx.x] + previousSum;
    }
}
69
+
70
/**
 * Transposes a row-major `height` x `width` matrix of unsigned ints into a
 * row-major `width` x `height` matrix, staging each tile in shared memory.
 *
 * Launch preconditions: blocks of BLOCK_DIM x BLOCK_DIM threads (the tile is
 * indexed directly by threadIdx.x / threadIdx.y), with blockIdx.x covering the
 * columns and blockIdx.y covering the rows of `input`.
 *
 * @param input   source matrix, element (y, x) at input[y * width + x]
 * @param output  destination matrix, element (x, y) at output[x * height + y]
 * @param height  number of rows of `input`
 * @param width   number of columns of `input`
 */
__global__ void
Transpose(const unsigned int *input,
          unsigned int *output,
          const unsigned int height,
          const unsigned int width)
{
    // The tile has the same element type as the data: staging unsigned ints in
    // a float tile would round every value above 2^24. The extra column of
    // padding avoids shared-memory bank conflicts on the column-wise read.
    __shared__ unsigned int tile[BLOCK_DIM][BLOCK_DIM + 1];

    // Coordinates of this thread's element in the input matrix.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
        const unsigned int index_in = yIndex * width + xIndex;
        tile[threadIdx.y][threadIdx.x] = input[index_in];
    }

    // Every thread reaches this barrier, including those outside the matrix.
    __syncthreads();

    // Coordinates in the output matrix: block indices are swapped so that
    // consecutive threadIdx.x still address consecutive output elements.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
    {
        const unsigned int index_out = yIndex * height + xIndex;
        // This guard matches the load guard of the thread that wrote
        // tile[threadIdx.x][threadIdx.y], so only initialized slots are read.
        output[index_out] = tile[threadIdx.x][threadIdx.y];
    }
}
0 commit comments