@@ -50,12 +50,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
50
50
void main() {
51
51
const int out_limits_scaled[2 ] = {out_limits.x + (TILE_SIZE_X - 1 ) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1 ) * TILE_SIZE_Y};
52
52
53
- const int div_by_x = int (gl_GlobalInvocationID.x / out_limits_scaled[0 ]);
54
- const int out_pos[3 ] = {int (gl_GlobalInvocationID.x % out_limits_scaled[0 ]), div_by_x, int (gl_GlobalInvocationID.y)};
53
+ const uint16_t div_by_x = uint16_t(gl_GlobalInvocationID.x / out_limits_scaled[0 ]);
54
+ const uint16_t out_pos_xy[2 ] = {uint16_t(gl_GlobalInvocationID.x % out_limits_scaled[0 ]), div_by_x};
55
+ const int out_pos_z = int (gl_GlobalInvocationID.y);
55
56
56
57
// If the top left position is out of bounds, then this invocation will have
57
58
// no work to do.
58
- if (out_pos [1 ] >= out_limits_scaled[1 ] || out_pos[ 2 ] >= out_limits.z) {
59
+ if (out_pos_xy [1 ] >= out_limits_scaled[1 ] || out_pos_z >= out_limits.z) {
59
60
return ;
60
61
}
61
62
@@ -68,8 +69,8 @@ void main() {
68
69
uint16_t pos[TILE_SIZE_X * TILE_SIZE_Y * 2 ];
69
70
for (uint16_t y = uint16_t(0 ), i = uint16_t(0 ); y < TILE_SIZE_Y; ++ y) {
70
71
for (uint16_t x = uint16_t(0 ); x < TILE_SIZE_X; ++ x) {
71
- pos[i * 2 ] = uint16_t(out_pos [0 ]) * TILE_SIZE_X + x;
72
- pos[i * 2 + 1 ] = uint16_t(out_pos [1 ]) * TILE_SIZE_Y + y;
72
+ pos[i * 2 ] = out_pos_xy [0 ] * TILE_SIZE_X + x;
73
+ pos[i * 2 + 1 ] = out_pos_xy [1 ] * TILE_SIZE_Y + y;
73
74
i++ ;
74
75
}
75
76
}
@@ -78,7 +79,7 @@ void main() {
78
79
// Tuple of consecutive 4 elements represents a single output texel.
79
80
float sum[TILE_SIZE_X * TILE_SIZE_Y * 4 ];
80
81
81
- const vec4 bias = texelFetch(t_bias, ivec2 (out_pos[ 2 ] , 0 ), 0 );
82
+ const vec4 bias = texelFetch(t_bias, ivec2 (out_pos_z , 0 ), 0 );
82
83
83
84
// Initialize the output array with the bias value
84
85
for (int i = 0 ; i < TILE_SIZE_X * TILE_SIZE_Y * 4 ; i += 4 ) {
@@ -98,7 +99,7 @@ void main() {
98
99
99
100
// Load kernel values from texels to array
100
101
[[unroll]] for (int i = 0 ; i < 4 ; ++ i) {
101
- const vec4 k_tex = texelFetch(t_kernel, ivec2 (z + i, out_pos[ 2 ] ), 0 );
102
+ const vec4 k_tex = texelFetch(t_kernel, ivec2 (z + i, out_pos_z ), 0 );
102
103
kernel_values[i * 4 + 0 ] = k_tex.x;
103
104
kernel_values[i * 4 + 1 ] = k_tex.y;
104
105
kernel_values[i * 4 + 2 ] = k_tex.z;
@@ -157,8 +158,8 @@ void main() {
157
158
}
158
159
159
160
for (int i = 0 ; i < TILE_SIZE_X * TILE_SIZE_Y; ++ i) {
160
- const ivec3 pos_l = ivec3 (pos[i * 2 ], pos[i * 2 + 1 ], out_pos[ 2 ] );
161
- if (all (lessThan (pos_l, out_limits.xyz ))) {
161
+ const ivec3 pos_l = ivec3 (pos[i * 2 ], pos[i * 2 + 1 ], out_pos_z );
162
+ if (all (lessThan (pos_l.xy , out_limits.xy ))) {
162
163
imageStore(t_out, pos_l, op(vec4 (sum[i * 4 ], sum[i * 4 + 1 ], sum[i * 4 + 2 ], sum[i * 4 + 3 ]), out_min, out_max));
163
164
}
164
165
}
0 commit comments