@@ -58,32 +58,28 @@ void main() {
58
58
return ;
59
59
}
60
60
61
- // If the top left position is out of bounds, then this invocation will have
62
- // no work to do.
63
- if (gpos.z >= out_limits.z) {
64
- return ;
65
- }
66
-
67
61
// Output position for TILE_SIZE = 2
68
62
// +--------+--------+
69
63
// | pos[0] | pos[1] |
70
64
// +--------+--------+
71
65
// | pos[2] | pos[3] |
72
66
// +--------+--------+
73
- ivec3 pos[TILE_SIZE_X * TILE_SIZE_Y];
67
+ int pos[TILE_SIZE_X * TILE_SIZE_Y * 2 ];
74
68
for (int y = 0 , i = 0 ; y < TILE_SIZE_Y; ++ y) {
75
69
for (int x = 0 ; x < TILE_SIZE_X; ++ x) {
76
- pos[i] = ivec3 (gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y, gpos.z);
70
+ pos[i * 2 ] = gpos.x * TILE_SIZE_X + x;
71
+ pos[i * 2 + 1 ] = gpos.y * TILE_SIZE_Y + y;
77
72
i++ ;
78
73
}
79
74
}
80
75
81
76
// Compute the index of the input texture that needs to be loaded for each
82
77
// output position. Note that negative indices can be produced indicating that
83
78
// the top-left element is in a region added by padding.
84
- ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y];
79
+ int ipos[TILE_SIZE_X * TILE_SIZE_Y * 2 ];
85
80
for (int i = 0 ; i < TILE_SIZE_X * TILE_SIZE_Y; ++ i) {
86
- ipos[i] = pos[i].xy * stride - padding;
81
+ ipos[i * 2 ] = pos[i * 2 ] * stride.x - padding.x;
82
+ ipos[i * 2 + 1 ] = pos[i * 2 + 1 ] * stride.y - padding.y;
87
83
}
88
84
89
85
// Final output array where each element is a tensor value.
@@ -118,7 +114,7 @@ void main() {
118
114
}
119
115
120
116
for (int i = 0 ; i < TILE_SIZE_X * TILE_SIZE_Y; ++ i) {
121
- const vec4 in_tex = texelFetch(t_in, ivec3 (ipos[i], z4), 0 );
117
+ const vec4 in_tex = texelFetch(t_in, ivec3 (ipos[i * 2 ], ipos[i * 2 + 1 ], z4), 0 );
122
118
// Load the input texel into an array
123
119
float tex_values[4 ];
124
120
tex_values[0 ] = in_tex.x;
@@ -169,8 +165,9 @@ void main() {
169
165
}
170
166
171
167
for (int i = 0 ; i < TILE_SIZE_X * TILE_SIZE_Y; ++ i) {
172
- if (all (lessThan (pos[i], out_limits.xyz))) {
173
- imageStore(t_out, pos[i], op(vec4 (sum[i * 4 ], sum[i * 4 + 1 ], sum[i * 4 + 2 ], sum[i * 4 + 3 ]), out_min, out_max));
168
+ const ivec3 pos_l = ivec3 (pos[i * 2 ], pos[i * 2 + 1 ], gpos.z);
169
+ if (all (lessThan (pos_l, out_limits.xyz))) {
170
+ imageStore(t_out, pos_l, op(vec4 (sum[i * 4 ], sum[i * 4 + 1 ], sum[i * 4 + 2 ], sum[i * 4 + 3 ]), out_min, out_max));
174
171
}
175
172
}
176
173
}
0 commit comments