Skip to content

Commit 4b3a083

Browse files
committed
- add more restricts
- clipper now outputs indexed geometry
1 parent c99f623 commit 4b3a083

File tree

6 files changed

+192
-124
lines changed

6 files changed

+192
-124
lines changed

rasterizer.cpp

Lines changed: 126 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
#include <algorithm>
66

7-
inline static void sort(vec4_t &A0, vec4_t &A1, vec4_t &B0, vec4_t &B1 )
7+
inline static void sort(vec4_t& RESTRICT A0, vec4_t& RESTRICT A1, vec4_t& RESTRICT B0, vec4_t& RESTRICT B1)
88
{
99
vec4_t mask = VecCmpLe(B0, B1);
1010
vec4_t sx = VecAdd(A0, A1);
@@ -16,7 +16,7 @@ inline static void sort(vec4_t &A0, vec4_t &A1, vec4_t &B0, vec4_t &B1 )
1616
B1 = VecSub(sy, B0);
1717
}
1818

19-
inline static void ExtractMatrix(const Matrix& m, RESTRICT vec4_t* matrix)
19+
inline static void ExtractMatrix(const Matrix& RESTRICT m, vec4_t* RESTRICT matrix)
2020
{
2121
#define EXTRACT(line) matrix[line*4+0] = VecShuffle( m.r[line], m.r[line], VecShuffleMask(0, 0, 0, 0)); \
2222
matrix[line*4+1] = VecShuffle( m.r[line], m.r[line], VecShuffleMask(1, 1, 1, 1)); \
@@ -42,75 +42,113 @@ __forceinline static vec4_t intersectLineZ(vec4_t a, vec4_t b, vec4_t plane)
4242
}
4343

4444
template< int vertex_component, bool cmp_func >
45-
__forceinline static int clip_triangle(RESTRICT const vec4_t* input, int count, RESTRICT vec4_t* output, vec4_t plane)
45+
__forceinline static uint32_t clip_triangle(vec4_t* RESTRICT input, uint32_t& vertex_count, const uint16_t* RESTRICT indices,
46+
uint32_t index_count, uint16_t* RESTRICT output, vec4_t plane)
4647
{
47-
int vertices = 0;
48-
for ( int i = 0; i < count; ++i )
48+
uint32_t output_indices = 0;
49+
for (uint32_t i = 0; i < index_count / 3; ++i)
4950
{
50-
vec4_t a0 = input[i*3 + 0];
51-
vec4_t b0 = input[i*3 + 1];
52-
vec4_t c0 = input[i*3 + 2];
53-
vec4_t tmp = VecShuffle(a0, b0, VecShuffleMask( vertex_component, 0, vertex_component, 0));
54-
vec4_t w = VecShuffle(tmp, c0, VecShuffleMask( 0, 2, vertex_component, vertex_component));
55-
56-
int mask = cmp_func ? VecMask(VecCmpGt(w, VecMul(VecShuffle(VecShuffle(a0, b0, VecShuffleMask(3, 3, 3, 3)), c0, VecShuffleMask(0, 2, 3, 3)), plane))) & 7 : VecMask(VecCmpLt(w, VecZero())) & 7;
57-
switch ( mask )
51+
uint16_t i0 = *indices++;
52+
uint16_t i1 = *indices++;
53+
uint16_t i2 = *indices++;
54+
vec4_t v0 = input[i0];
55+
vec4_t v1 = input[i1];
56+
vec4_t v2 = input[i2];
57+
vec4_t tmp = VecShuffle(v0, v1, VecShuffleMask(vertex_component, 0, vertex_component, 0));
58+
vec4_t w = VecShuffle(tmp, v2, VecShuffleMask(0, 2, vertex_component, vertex_component));
59+
60+
int mask = cmp_func ? VecMask(VecCmpGt(w, VecMul(VecShuffle(VecShuffle(v0, v1, VecShuffleMask(3, 3, 3, 3)), v2, VecShuffleMask(0, 2, 3, 3)), plane))) & 7
61+
: VecMask(VecCmpLt(w, VecZero())) & 7;
62+
switch (mask)
5863
{
5964
case 7:
6065
break;
6166
case 0:
62-
output[vertices++] = a0;
63-
output[vertices++] = b0;
64-
output[vertices++] = c0;
67+
output[output_indices + 0] = i0;
68+
output[output_indices + 1] = i1;
69+
output[output_indices + 2] = i2;
70+
output_indices += 3;
6571
break;
6672
case 1:
6773
case 2:
6874
case 4:
6975
{
70-
if ( mask & 1 ) { vec4_t t = a0; a0 = b0; b0 = c0; c0 = t; }
71-
else if ( mask & 2 ) { vec4_t t = a0; a0 = c0; c0 = b0; b0 = t; }
72-
73-
vec4_t ba = intersectLineZ<vertex_component, cmp_func>( c0, a0, plane );
74-
vec4_t bb = intersectLineZ<vertex_component, cmp_func>( c0, b0, plane );
75-
output[vertices++] = a0;
76-
output[vertices++] = b0;
77-
output[vertices++] = bb;
78-
output[vertices++] = bb;
79-
output[vertices++] = ba;
80-
output[vertices++] = a0;
76+
#define SHUFFLE(type,d0,d1,d2,s0,s1,s2) {type tv = s0; d0 = s1; d1 = s2; d2 = tv; }
77+
if (mask & 1)
78+
{
79+
SHUFFLE(vec4_t, v0, v1, v2, v0, v1, v2);
80+
SHUFFLE(uint16_t, i0, i1, i2, i0, i1, i2);
81+
}
82+
else if (mask & 2)
83+
{
84+
SHUFFLE(vec4_t, v0, v2, v1, v0, v2, v1);
85+
SHUFFLE(uint16_t, i0, i2, i1, i0, i2, i1);
86+
}
87+
#undef SHUFFLE
88+
89+
uint16_t ba = vertex_count;
90+
input[vertex_count++] = intersectLineZ<vertex_component, cmp_func>(v2, v0, plane);
91+
uint16_t bb = vertex_count;
92+
input[vertex_count++] = intersectLineZ<vertex_component, cmp_func>(v2, v1, plane);
93+
94+
output[output_indices + 0] = i0;
95+
output[output_indices + 1] = i1;
96+
output[output_indices + 2] = bb;
97+
output[output_indices + 3] = bb;
98+
output[output_indices + 4] = ba;
99+
output[output_indices + 5] = i0;
100+
output_indices += 6;
81101
}
82102
break;
83103
default:
84104
{
85-
vec4_t in = ( mask & 1 ) == 0 ? a0 : ( ( mask & 4 ) == 0 ? c0 : b0 );
86-
output[vertices++] = mask & 1 ? intersectLineZ< vertex_component, cmp_func >( a0, in, plane ) : a0;
87-
output[vertices++] = mask & 2 ? intersectLineZ< vertex_component, cmp_func >( b0, in, plane ) : b0;
88-
output[vertices++] = mask & 4 ? intersectLineZ< vertex_component, cmp_func >( c0, in, plane ) : c0;
105+
vec4_t in = (mask & 1) == 0 ? v0 : ((mask & 4) == 0 ? v2 : v1);
106+
if (mask & 1)
107+
{
108+
output[output_indices + 0] = vertex_count;
109+
input[vertex_count++] = intersectLineZ< vertex_component, cmp_func >(v0, in, plane);
110+
}
111+
else
112+
output[output_indices + 0] = i0;
113+
if (mask & 2)
114+
{
115+
output[output_indices + 1] = vertex_count;
116+
input[vertex_count++] = intersectLineZ< vertex_component, cmp_func >(v1, in, plane);
117+
}
118+
else
119+
output[output_indices + 1] = i1;
120+
if (mask & 4)
121+
{
122+
output[output_indices + 2] = vertex_count;
123+
input[vertex_count++] = intersectLineZ< vertex_component, cmp_func >(v2, in, plane);
124+
}
125+
else
126+
output[output_indices + 2] = i2;
127+
output_indices += 3;
89128
}
90129
break;
91130
}
92131
}
93-
assert( vertices < 196 );
94-
return vertices / 3;
132+
return output_indices;
95133
}
96134

97-
__forceinline int static clip_triangle(RESTRICT vec4_t* v, RESTRICT vec4_t* dst)
135+
__forceinline static uint32_t clip_triangles(vec4_t* RESTRICT vertices, uint32_t& vertex_count, const uint16_t* RESTRICT indices, uint32_t index_count,
136+
uint16_t* RESTRICT output_indices)
98137
{
99138
vec4_t g_total_width_v = Vector4(Rasterizer::g_total_width);
100139
vec4_t g_total_height_v = Vector4(Rasterizer::g_total_height);
101140

102141
int count = 4;
103-
vec4_t input_array[196], output_array[196];
104-
count = clip_triangle<1, false>(v, count, input_array, VecZero()); // y < 0
105-
count = clip_triangle<0, false>(input_array, count, output_array, VecZero()); // x < 0
106-
count = clip_triangle<0, true>(output_array, count, input_array, g_total_width_v); // x > 1280
107-
count = clip_triangle<2, false>(input_array, count, output_array, VecZero()); // z < 0
108-
count = clip_triangle<1, true>(output_array, count, dst, g_total_height_v); // y > 720
109-
return count;
142+
uint16_t input_array[1024], output_array[1024];
143+
count = clip_triangle<1, false>(vertices, vertex_count, indices, index_count, input_array, VecZero()); // y < 0
144+
count = clip_triangle<0, false>(vertices, vertex_count, input_array, count, output_array, VecZero()); // x < 0
145+
count = clip_triangle<0, true>(vertices, vertex_count, output_array, count, input_array, g_total_width_v); // x > 1280
146+
count = clip_triangle<2, false>(vertices, vertex_count, input_array, count, output_array, VecZero()); // z < 0
147+
return clip_triangle<1, true>(vertices, vertex_count, output_array, count, output_indices, g_total_height_v); // y > 720
110148
}
111149

112-
__forceinline void Rasterizer::push_4triangles(TrianagleData& data, uint32_t flag, int* bounds_array,
113-
RESTRICT const vec4_t* x, RESTRICT const vec4_t* y, RESTRICT const vec4_t* w, bool select_tiles)
150+
__forceinline void Rasterizer::push_4triangles(TrianagleData& RESTRICT data, uint32_t flag, int* RESTRICT bounds_array,
151+
const vec4_t* RESTRICT x, const vec4_t* RESTRICT y, const vec4_t* RESTRICT w, bool select_tiles)
114152
{
115153
const vec4_t local_fixed_point = Vector4(1<<g_fixed_point_bits);
116154
vec4_t x0 = VecMul(x[0], local_fixed_point);
@@ -121,6 +159,13 @@ __forceinline void Rasterizer::push_4triangles(TrianagleData& data, uint32_t fla
121159
vec4_t y2 = y[2];
122160

123161
uint32_t mask = VecMask(VecCmpLt(VecSub(VecMul(VecSub(x1, x0), VecSub(y2, y0)), VecMul(VecSub(x2, x0), VecSub(y1, y0))), VecZero()));
162+
#if USE_STATS
163+
if (!m_mt)
164+
{
165+
m_triangles_backface += 4 - __builtin_popcount(mask);
166+
m_full_groups += mask == 0 ? 1 : 0;
167+
}
168+
#endif
124169
if (mask == 0)
125170
return;
126171

@@ -207,84 +252,64 @@ __forceinline void Rasterizer::push_4triangles(TrianagleData& data, uint32_t fla
207252
}
208253
}
209254

210-
__forceinline void load_4vertices(vec4_t& x, vec4_t& y, vec4_t& w, const RESTRICT vec4_t* src, const RESTRICT uint16_t* indices, uint32_t base_index, bool use_indices)
255+
__forceinline static void load_4vertices(vec4_t& RESTRICT x, vec4_t& RESTRICT y, vec4_t& RESTRICT w, const vec4_t* RESTRICT src, const uint16_t* RESTRICT indices, uint32_t base_index)
211256
{
212-
#define IDX(num)(use_indices ? indices[base_index + num] : base_index + num)
213-
vec4_t v0_0 = src[IDX(0)];
214-
vec4_t v0_1 = src[IDX(3)];
215-
vec4_t v0_2 = src[IDX(6)];
216-
vec4_t v0_3 = src[IDX(9)];
257+
vec4_t v0_0 = src[indices[base_index + 0]];
258+
vec4_t v0_1 = src[indices[base_index + 3]];
259+
vec4_t v0_2 = src[indices[base_index + 6]];
260+
vec4_t v0_3 = src[indices[base_index + 9]];
217261
vec4_t tmp0 = VecUnpackLo(v0_0, v0_1); // x0_0 x1_0 y0_0 y1_0
218262
vec4_t tmp1 = VecUnpackLo(v0_2, v0_3); // x2_0 x3_0 y2_0 y3_0
219263
vec4_t tmp0_0 = VecUnpackHi(v0_0, v0_1); // z0_0 z1_0 w0_0 w1_0
220264
vec4_t tmp1_0 = VecUnpackHi(v0_2, v0_3); // z2_0 z3_0 w2_0 w3_0
221265
w = VecMoveHL(tmp0_0, tmp1_0); // w0_0 w1_0 w2_0 w3_0
222266
x = VecMul(VecMoveLH(tmp0, tmp1), VecRcp(w));
223267
y = VecMul(VecMoveHL(tmp0, tmp1), VecRcp(w));
224-
#undef IDX
225268
}
226269

227-
__forceinline void Rasterizer::push_triangle_batched(TrianagleData& data,uint32_t flag, const RESTRICT vec4_t* src, int count, const RESTRICT uint16_t* indices,
228-
RESTRICT int* bounds_array, bool select_tiles, bool use_indices)
270+
__forceinline void Rasterizer::push_triangle_batched(TrianagleData& RESTRICT data,uint32_t flag, const vec4_t* RESTRICT src, int count, const uint16_t* RESTRICT indices,
271+
int* RESTRICT bounds_array, bool select_tiles)
229272
{
230273
assert(( (count / 3) & 3 ) == 0);
231274
for ( int i = 0; i < count; i += 12 )
232275
{
233276
vec4_t x[3], y[3], w[3];
234-
load_4vertices(x[0], y[0], w[0], src, indices, i + 0, use_indices);
235-
load_4vertices(x[1], y[1], w[1], src, indices, i + 1, use_indices);
236-
load_4vertices(x[2], y[2], w[2], src, indices, i + 2, use_indices);
277+
load_4vertices(x[0], y[0], w[0], src, indices, i + 0);
278+
load_4vertices(x[1], y[1], w[1], src, indices, i + 1);
279+
load_4vertices(x[2], y[2], w[2], src, indices, i + 2);
237280
push_4triangles(data, flag, bounds_array, x, y, w, select_tiles);
238281
}
239282
}
240283

241-
void Rasterizer::push_object_clipped(ThreadData& thread_data, const uint16_t* indices, int index_count,
242-
const vec4_t* transformed_vertices, int* bounds_array, uint32_t flag, bool select_tiles)
284+
void Rasterizer::push_object_clipped(ThreadData& RESTRICT thread_data, const uint16_t* RESTRICT indices, int index_count,
285+
vec4_t* RESTRICT transformed_vertices, uint32_t vertex_count,
286+
int* RESTRICT bounds_array, uint32_t flag, bool select_tiles)
243287
{
244288
assert(index_count >= 12);
245289

246-
constexpr uint32_t max_triangles_in_object = 1024;
247-
vec4_t clipped_triangles[max_triangles_in_object*3];
248-
249-
uint32_t clipped_triangle_count = 0;
250-
for (int i = 0; i < index_count; i += 12)
251-
{
252-
vec4_t v[12];
253-
254-
v[0] = transformed_vertices[indices[i + 0]];
255-
v[1] = transformed_vertices[indices[i + 1]];
256-
v[2] = transformed_vertices[indices[i + 2]];
290+
uint16_t output_indices[1024];
291+
uint32_t clipped_indices = clip_triangles(transformed_vertices, vertex_count, indices, index_count, output_indices);
292+
assert(clipped_indices < 1024);
257293

258-
v[3] = transformed_vertices[indices[i + 3]];
259-
v[4] = transformed_vertices[indices[i + 4]];
260-
v[5] = transformed_vertices[indices[i + 5]];
261-
262-
v[6] = transformed_vertices[indices[i + 6]];
263-
v[7] = transformed_vertices[indices[i + 7]];
264-
v[8] = transformed_vertices[indices[i + 8]];
265-
266-
v[9] = transformed_vertices[indices[i + 9 ]];
267-
v[10] = transformed_vertices[indices[i + 10 ]];
268-
v[11] = transformed_vertices[indices[i + 11 ]];
269-
270-
clipped_triangle_count += clip_triangle(v, clipped_triangles + clipped_triangle_count * 3);
271-
}
272-
assert(clipped_triangle_count < max_triangles_in_object);
294+
if (clipped_indices == 0)
295+
return;
273296

274-
int tris_to_pad = ( ( clipped_triangle_count + 3 ) & ~3 ) - clipped_triangle_count;
275-
vec4_t& v0 = clipped_triangles[ clipped_triangle_count * 3 - 3 ];
276-
vec4_t& v1 = clipped_triangles[ clipped_triangle_count * 3 - 2 ];
277-
vec4_t& v2 = clipped_triangles[ clipped_triangle_count * 3 - 1 ];
278-
for ( int i = 0; i < tris_to_pad; ++i, ++clipped_triangle_count )
297+
uint32_t clipped_triangle_count = clipped_indices / 3;
298+
uint32_t tris_to_pad = ((clipped_triangle_count + 3) & ~3) - clipped_triangle_count;
299+
uint16_t i0 = output_indices[clipped_indices - 3];
300+
uint16_t i1 = output_indices[clipped_indices - 2];
301+
uint16_t i2 = output_indices[clipped_indices - 1];
302+
for (uint32_t i = 0; i < tris_to_pad; ++i)
279303
{
280-
clipped_triangles[ clipped_triangle_count*3 + 0 ] = v0;
281-
clipped_triangles[ clipped_triangle_count*3 + 1 ] = v1;
282-
clipped_triangles[ clipped_triangle_count*3 + 2 ] = v2;
304+
output_indices[clipped_indices + 0] = i0;
305+
output_indices[clipped_indices + 1] = i1;
306+
output_indices[clipped_indices + 2] = i2;
307+
clipped_indices += 3;
283308
}
284-
push_triangle_batched(thread_data.data, flag, clipped_triangles, clipped_triangle_count*3, 0, bounds_array, select_tiles, false);
309+
push_triangle_batched(thread_data.data, flag, transformed_vertices, clipped_indices, output_indices, bounds_array, select_tiles);
285310
}
286311

287-
bool Rasterizer::occlude_object(const RESTRICT vec4_t* m, vec4_t v_min, vec4_t v_max, RESTRICT int* bounds_array)
312+
bool Rasterizer::occlude_object(const vec4_t* RESTRICT m, vec4_t v_min, vec4_t v_max, int* RESTRICT bounds_array)
288313
{
289314
vec4_t g_total_width_v = Vector4(g_total_width);
290315
vec4_t g_total_height_v = Vector4(g_total_height);
@@ -393,7 +418,7 @@ __forceinline vec4_t Rasterizer::get_tile_bounds(vec4_t v_min, vec4_t v_max)
393418
return VecMax(VecMin(tile_bounds, m_tile_bounds), VecZero());
394419
}
395420

396-
void Rasterizer::flush_thread_data(ThreadData& thread_data)
421+
void Rasterizer::flush_thread_data(ThreadData& RESTRICT thread_data)
397422
{
398423
assert(m_mt);
399424

@@ -423,7 +448,7 @@ void Rasterizer::flush_thread_data(ThreadData& thread_data)
423448
thread_data.clear();
424449
}
425450

426-
void Rasterizer::push_objects(const RESTRICT Object* objects, uint32_t object_count, uint32_t thread_index)
451+
void Rasterizer::push_objects(const Object* RESTRICT objects, uint32_t object_count, uint32_t thread_index)
427452
{
428453
ThreadData & thread_data = m_mt ? m_thread_data[thread_index] : m_data;
429454
if (m_mt)
@@ -494,9 +519,9 @@ void Rasterizer::push_objects(const RESTRICT Object* objects, uint32_t object_co
494519

495520
bool select_tiles = !(bounds_array[0] + 2 > bounds_array[2] && bounds_array[1] + 2 > bounds_array[3]);
496521
if (inside)
497-
push_triangle_batched(thread_data.data, flag, transformed_vertices, obj.index_count, obj.indices, bounds_array, select_tiles, true);
522+
push_triangle_batched(thread_data.data, flag, transformed_vertices, obj.index_count, obj.indices, bounds_array, select_tiles);
498523
else
499-
push_object_clipped(thread_data, obj.indices, obj.index_count, transformed_vertices, bounds_array, flag, select_tiles);
524+
push_object_clipped(thread_data, obj.indices, obj.index_count, transformed_vertices, obj.vertex_count, bounds_array, flag, select_tiles);
500525
}
501526

502527
if (m_mt)
@@ -566,6 +591,8 @@ void Rasterizer::Init(uint32_t num_threads)
566591
m_tiles.push_back(Tile(i, j));
567592
}
568593

594+
m_tile_height_v = Vector4(Tile::g_tile_height);
595+
569596
m_masks.resize(g_width*g_max_masks_per_tile);
570597
for (int tile = 0; tile < g_width; ++tile)
571598
{
@@ -622,6 +649,8 @@ void Rasterizer::begin(const Matrix& m)
622649
m_triangles_drawn_occludee_total = 0;
623650
m_triangles_skipped = 0;
624651
m_triangles_offscreen = 0;
652+
m_triangles_backface = 0;
653+
m_full_groups = 0;
625654
}
626655

627656
void Rasterizer::ThreadData::clear()

0 commit comments

Comments
 (0)