4
4
5
5
#include < algorithm>
6
6
7
- inline static void sort (vec4_t & A0, vec4_t & A1, vec4_t & B0, vec4_t &B1 )
7
+ inline static void sort (vec4_t & RESTRICT A0, vec4_t & RESTRICT A1, vec4_t & RESTRICT B0, vec4_t & RESTRICT B1 )
8
8
{
9
9
vec4_t mask = VecCmpLe (B0, B1);
10
10
vec4_t sx = VecAdd (A0, A1);
@@ -16,7 +16,7 @@ inline static void sort(vec4_t &A0, vec4_t &A1, vec4_t &B0, vec4_t &B1 )
16
16
B1 = VecSub (sy, B0);
17
17
}
18
18
19
- inline static void ExtractMatrix (const Matrix& m, RESTRICT vec4_t * matrix)
19
+ inline static void ExtractMatrix (const Matrix& RESTRICT m, vec4_t * RESTRICT matrix)
20
20
{
21
21
#define EXTRACT (line ) matrix[line*4 +0 ] = VecShuffle( m.r[line], m.r[line], VecShuffleMask(0 , 0 , 0 , 0 )); \
22
22
matrix[line*4 +1 ] = VecShuffle ( m.r [line], m.r [line], VecShuffleMask (1 , 1 , 1 , 1 )); \
@@ -42,75 +42,113 @@ __forceinline static vec4_t intersectLineZ(vec4_t a, vec4_t b, vec4_t plane)
42
42
}
43
43
44
44
template < int vertex_component, bool cmp_func >
45
- __forceinline static int clip_triangle (RESTRICT const vec4_t * input, int count, RESTRICT vec4_t * output, vec4_t plane)
45
+ __forceinline static uint32_t clip_triangle (vec4_t * RESTRICT input, uint32_t & vertex_count, const uint16_t * RESTRICT indices,
46
+ uint32_t index_count, uint16_t * RESTRICT output, vec4_t plane)
46
47
{
47
- int vertices = 0 ;
48
- for ( int i = 0 ; i < count ; ++i )
48
+ uint32_t output_indices = 0 ;
49
+ for (uint32_t i = 0 ; i < index_count / 3 ; ++i)
49
50
{
50
- vec4_t a0 = input[i*3 + 0 ];
51
- vec4_t b0 = input[i*3 + 1 ];
52
- vec4_t c0 = input[i*3 + 2 ];
53
- vec4_t tmp = VecShuffle (a0, b0, VecShuffleMask ( vertex_component, 0 , vertex_component, 0 ));
54
- vec4_t w = VecShuffle (tmp, c0, VecShuffleMask ( 0 , 2 , vertex_component, vertex_component));
55
-
56
- int mask = cmp_func ? VecMask (VecCmpGt (w, VecMul (VecShuffle (VecShuffle (a0, b0, VecShuffleMask (3 , 3 , 3 , 3 )), c0, VecShuffleMask (0 , 2 , 3 , 3 )), plane))) & 7 : VecMask (VecCmpLt (w, VecZero ())) & 7 ;
57
- switch ( mask )
51
+ uint16_t i0 = *indices++;
52
+ uint16_t i1 = *indices++;
53
+ uint16_t i2 = *indices++;
54
+ vec4_t v0 = input[i0];
55
+ vec4_t v1 = input[i1];
56
+ vec4_t v2 = input[i2];
57
+ vec4_t tmp = VecShuffle (v0, v1, VecShuffleMask (vertex_component, 0 , vertex_component, 0 ));
58
+ vec4_t w = VecShuffle (tmp, v2, VecShuffleMask (0 , 2 , vertex_component, vertex_component));
59
+
60
+ int mask = cmp_func ? VecMask (VecCmpGt (w, VecMul (VecShuffle (VecShuffle (v0, v1, VecShuffleMask (3 , 3 , 3 , 3 )), v2, VecShuffleMask (0 , 2 , 3 , 3 )), plane))) & 7
61
+ : VecMask (VecCmpLt (w, VecZero ())) & 7 ;
62
+ switch (mask)
58
63
{
59
64
case 7 :
60
65
break ;
61
66
case 0 :
62
- output[vertices++] = a0;
63
- output[vertices++] = b0;
64
- output[vertices++] = c0;
67
+ output[output_indices + 0 ] = i0;
68
+ output[output_indices + 1 ] = i1;
69
+ output[output_indices + 2 ] = i2;
70
+ output_indices += 3 ;
65
71
break ;
66
72
case 1 :
67
73
case 2 :
68
74
case 4 :
69
75
{
70
- if ( mask & 1 ) { vec4_t t = a0; a0 = b0; b0 = c0; c0 = t; }
71
- else if ( mask & 2 ) { vec4_t t = a0; a0 = c0; c0 = b0; b0 = t; }
72
-
73
- vec4_t ba = intersectLineZ<vertex_component, cmp_func>( c0, a0, plane );
74
- vec4_t bb = intersectLineZ<vertex_component, cmp_func>( c0, b0, plane );
75
- output[vertices++] = a0;
76
- output[vertices++] = b0;
77
- output[vertices++] = bb;
78
- output[vertices++] = bb;
79
- output[vertices++] = ba;
80
- output[vertices++] = a0;
76
+ #define SHUFFLE (type,d0,d1,d2,s0,s1,s2 ) {type tv = s0; d0 = s1; d1 = s2; d2 = tv; }
77
+ if (mask & 1 )
78
+ {
79
+ SHUFFLE (vec4_t , v0, v1, v2, v0, v1, v2);
80
+ SHUFFLE (uint16_t , i0, i1, i2, i0, i1, i2);
81
+ }
82
+ else if (mask & 2 )
83
+ {
84
+ SHUFFLE (vec4_t , v0, v2, v1, v0, v2, v1);
85
+ SHUFFLE (uint16_t , i0, i2, i1, i0, i2, i1);
86
+ }
87
+ #undef SHUFFLE
88
+
89
+ uint16_t ba = vertex_count;
90
+ input[vertex_count++] = intersectLineZ<vertex_component, cmp_func>(v2, v0, plane);
91
+ uint16_t bb = vertex_count;
92
+ input[vertex_count++] = intersectLineZ<vertex_component, cmp_func>(v2, v1, plane);
93
+
94
+ output[output_indices + 0 ] = i0;
95
+ output[output_indices + 1 ] = i1;
96
+ output[output_indices + 2 ] = bb;
97
+ output[output_indices + 3 ] = bb;
98
+ output[output_indices + 4 ] = ba;
99
+ output[output_indices + 5 ] = i0;
100
+ output_indices += 6 ;
81
101
}
82
102
break ;
83
103
default :
84
104
{
85
- vec4_t in = ( mask & 1 ) == 0 ? a0 : ( ( mask & 4 ) == 0 ? c0 : b0 );
86
- output[vertices++] = mask & 1 ? intersectLineZ< vertex_component, cmp_func >( a0, in, plane ) : a0;
87
- output[vertices++] = mask & 2 ? intersectLineZ< vertex_component, cmp_func >( b0, in, plane ) : b0;
88
- output[vertices++] = mask & 4 ? intersectLineZ< vertex_component, cmp_func >( c0, in, plane ) : c0;
105
+ vec4_t in = (mask & 1 ) == 0 ? v0 : ((mask & 4 ) == 0 ? v2 : v1);
106
+ if (mask & 1 )
107
+ {
108
+ output[output_indices + 0 ] = vertex_count;
109
+ input[vertex_count++] = intersectLineZ< vertex_component, cmp_func >(v0, in, plane);
110
+ }
111
+ else
112
+ output[output_indices + 0 ] = i0;
113
+ if (mask & 2 )
114
+ {
115
+ output[output_indices + 1 ] = vertex_count;
116
+ input[vertex_count++] = intersectLineZ< vertex_component, cmp_func >(v1, in, plane);
117
+ }
118
+ else
119
+ output[output_indices + 1 ] = i1;
120
+ if (mask & 4 )
121
+ {
122
+ output[output_indices + 2 ] = vertex_count;
123
+ input[vertex_count++] = intersectLineZ< vertex_component, cmp_func >(v2, in, plane);
124
+ }
125
+ else
126
+ output[output_indices + 2 ] = i2;
127
+ output_indices += 3 ;
89
128
}
90
129
break ;
91
130
}
92
131
}
93
- assert ( vertices < 196 );
94
- return vertices / 3 ;
132
+ return output_indices;
95
133
}
96
134
97
- __forceinline int static clip_triangle (RESTRICT vec4_t * v, RESTRICT vec4_t * dst)
135
+ __forceinline static uint32_t clip_triangles (vec4_t * RESTRICT vertices, uint32_t & vertex_count, const uint16_t * RESTRICT indices, uint32_t index_count,
136
+ uint16_t * RESTRICT output_indices)
98
137
{
99
138
vec4_t g_total_width_v = Vector4 (Rasterizer::g_total_width);
100
139
vec4_t g_total_height_v = Vector4 (Rasterizer::g_total_height);
101
140
102
141
int count = 4 ;
103
- vec4_t input_array[196 ], output_array[196 ];
104
- count = clip_triangle<1 , false >(v, count, input_array, VecZero ()); // y < 0
105
- count = clip_triangle<0 , false >(input_array, count, output_array, VecZero ()); // x < 0
106
- count = clip_triangle<0 , true >(output_array, count, input_array, g_total_width_v); // x > 1280
107
- count = clip_triangle<2 , false >(input_array, count, output_array, VecZero ()); // z < 0
108
- count = clip_triangle<1 , true >(output_array, count, dst, g_total_height_v); // y > 720
109
- return count;
142
+ uint16_t input_array[1024 ], output_array[1024 ];
143
+ count = clip_triangle<1 , false >(vertices, vertex_count, indices, index_count, input_array, VecZero ()); // y < 0
144
+ count = clip_triangle<0 , false >(vertices, vertex_count, input_array, count, output_array, VecZero ()); // x < 0
145
+ count = clip_triangle<0 , true >(vertices, vertex_count, output_array, count, input_array, g_total_width_v); // x > 1280
146
+ count = clip_triangle<2 , false >(vertices, vertex_count, input_array, count, output_array, VecZero ()); // z < 0
147
+ return clip_triangle<1 , true >(vertices, vertex_count, output_array, count, output_indices, g_total_height_v); // y > 720
110
148
}
111
149
112
- __forceinline void Rasterizer::push_4triangles (TrianagleData& data, uint32_t flag, int * bounds_array,
113
- RESTRICT const vec4_t * x, RESTRICT const vec4_t * y, RESTRICT const vec4_t * w, bool select_tiles)
150
+ __forceinline void Rasterizer::push_4triangles (TrianagleData& RESTRICT data, uint32_t flag, int * RESTRICT bounds_array,
151
+ const vec4_t * RESTRICT x, const vec4_t * RESTRICT y, const vec4_t * RESTRICT w, bool select_tiles)
114
152
{
115
153
const vec4_t local_fixed_point = Vector4 (1 <<g_fixed_point_bits);
116
154
vec4_t x0 = VecMul (x[0 ], local_fixed_point);
@@ -121,6 +159,13 @@ __forceinline void Rasterizer::push_4triangles(TrianagleData& data, uint32_t fla
121
159
vec4_t y2 = y[2 ];
122
160
123
161
uint32_t mask = VecMask (VecCmpLt (VecSub (VecMul (VecSub (x1, x0), VecSub (y2, y0)), VecMul (VecSub (x2, x0), VecSub (y1, y0))), VecZero ()));
162
+ #if USE_STATS
163
+ if (!m_mt)
164
+ {
165
+ m_triangles_backface += 4 - __builtin_popcount (mask);
166
+ m_full_groups += mask == 0 ? 1 : 0 ;
167
+ }
168
+ #endif
124
169
if (mask == 0 )
125
170
return ;
126
171
@@ -207,84 +252,64 @@ __forceinline void Rasterizer::push_4triangles(TrianagleData& data, uint32_t fla
207
252
}
208
253
}
209
254
210
- __forceinline void load_4vertices (vec4_t & x, vec4_t & y, vec4_t & w, const RESTRICT vec4_t * src, const RESTRICT uint16_t * indices, uint32_t base_index, bool use_indices )
255
+ __forceinline static void load_4vertices (vec4_t & RESTRICT x, vec4_t & RESTRICT y, vec4_t & RESTRICT w, const vec4_t * RESTRICT src, const uint16_t * RESTRICT indices, uint32_t base_index)
211
256
{
212
- #define IDX (num )(use_indices ? indices[base_index + num] : base_index + num)
213
- vec4_t v0_0 = src[IDX (0 )];
214
- vec4_t v0_1 = src[IDX (3 )];
215
- vec4_t v0_2 = src[IDX (6 )];
216
- vec4_t v0_3 = src[IDX (9 )];
257
+ vec4_t v0_0 = src[indices[base_index + 0 ]];
258
+ vec4_t v0_1 = src[indices[base_index + 3 ]];
259
+ vec4_t v0_2 = src[indices[base_index + 6 ]];
260
+ vec4_t v0_3 = src[indices[base_index + 9 ]];
217
261
vec4_t tmp0 = VecUnpackLo (v0_0, v0_1); // x0_0 x1_0 y0_0 y1_0
218
262
vec4_t tmp1 = VecUnpackLo (v0_2, v0_3); // x2_0 x3_0 y2_0 y3_0
219
263
vec4_t tmp0_0 = VecUnpackHi (v0_0, v0_1); // z0_0 z1_0 w0_0 w1_0
220
264
vec4_t tmp1_0 = VecUnpackHi (v0_2, v0_3); // z2_0 z3_0 w2_0 w3_0
221
265
w = VecMoveHL (tmp0_0, tmp1_0); // w0_0 w1_0 w2_0 w3_0
222
266
x = VecMul (VecMoveLH (tmp0, tmp1), VecRcp (w));
223
267
y = VecMul (VecMoveHL (tmp0, tmp1), VecRcp (w));
224
- #undef IDX
225
268
}
226
269
227
- __forceinline void Rasterizer::push_triangle_batched (TrianagleData& data,uint32_t flag, const RESTRICT vec4_t * src, int count, const RESTRICT uint16_t * indices,
228
- RESTRICT int * bounds_array, bool select_tiles, bool use_indices )
270
+ __forceinline void Rasterizer::push_triangle_batched (TrianagleData& RESTRICT data,uint32_t flag, const vec4_t * RESTRICT src, int count, const uint16_t * RESTRICT indices,
271
+ int * RESTRICT bounds_array, bool select_tiles)
229
272
{
230
273
assert (( (count / 3 ) & 3 ) == 0 );
231
274
for ( int i = 0 ; i < count; i += 12 )
232
275
{
233
276
vec4_t x[3 ], y[3 ], w[3 ];
234
- load_4vertices (x[0 ], y[0 ], w[0 ], src, indices, i + 0 , use_indices );
235
- load_4vertices (x[1 ], y[1 ], w[1 ], src, indices, i + 1 , use_indices );
236
- load_4vertices (x[2 ], y[2 ], w[2 ], src, indices, i + 2 , use_indices );
277
+ load_4vertices (x[0 ], y[0 ], w[0 ], src, indices, i + 0 );
278
+ load_4vertices (x[1 ], y[1 ], w[1 ], src, indices, i + 1 );
279
+ load_4vertices (x[2 ], y[2 ], w[2 ], src, indices, i + 2 );
237
280
push_4triangles (data, flag, bounds_array, x, y, w, select_tiles);
238
281
}
239
282
}
240
283
241
- void Rasterizer::push_object_clipped (ThreadData& thread_data, const uint16_t * indices, int index_count,
242
- const vec4_t * transformed_vertices, int * bounds_array, uint32_t flag, bool select_tiles)
284
+ void Rasterizer::push_object_clipped (ThreadData& RESTRICT thread_data, const uint16_t * RESTRICT indices, int index_count,
285
+ vec4_t * RESTRICT transformed_vertices, uint32_t vertex_count,
286
+ int * RESTRICT bounds_array, uint32_t flag, bool select_tiles)
243
287
{
244
288
assert (index_count >= 12 );
245
289
246
- constexpr uint32_t max_triangles_in_object = 1024 ;
247
- vec4_t clipped_triangles[max_triangles_in_object*3 ];
248
-
249
- uint32_t clipped_triangle_count = 0 ;
250
- for (int i = 0 ; i < index_count; i += 12 )
251
- {
252
- vec4_t v[12 ];
253
-
254
- v[0 ] = transformed_vertices[indices[i + 0 ]];
255
- v[1 ] = transformed_vertices[indices[i + 1 ]];
256
- v[2 ] = transformed_vertices[indices[i + 2 ]];
290
+ uint16_t output_indices[1024 ];
291
+ uint32_t clipped_indices = clip_triangles (transformed_vertices, vertex_count, indices, index_count, output_indices);
292
+ assert (clipped_indices < 1024 );
257
293
258
- v[3 ] = transformed_vertices[indices[i + 3 ]];
259
- v[4 ] = transformed_vertices[indices[i + 4 ]];
260
- v[5 ] = transformed_vertices[indices[i + 5 ]];
261
-
262
- v[6 ] = transformed_vertices[indices[i + 6 ]];
263
- v[7 ] = transformed_vertices[indices[i + 7 ]];
264
- v[8 ] = transformed_vertices[indices[i + 8 ]];
265
-
266
- v[9 ] = transformed_vertices[indices[i + 9 ]];
267
- v[10 ] = transformed_vertices[indices[i + 10 ]];
268
- v[11 ] = transformed_vertices[indices[i + 11 ]];
269
-
270
- clipped_triangle_count += clip_triangle (v, clipped_triangles + clipped_triangle_count * 3 );
271
- }
272
- assert (clipped_triangle_count < max_triangles_in_object);
294
+ if (clipped_indices == 0 )
295
+ return ;
273
296
274
- int tris_to_pad = ( ( clipped_triangle_count + 3 ) & ~3 ) - clipped_triangle_count;
275
- vec4_t & v0 = clipped_triangles[ clipped_triangle_count * 3 - 3 ];
276
- vec4_t & v1 = clipped_triangles[ clipped_triangle_count * 3 - 2 ];
277
- vec4_t & v2 = clipped_triangles[ clipped_triangle_count * 3 - 1 ];
278
- for ( int i = 0 ; i < tris_to_pad; ++i, ++clipped_triangle_count )
297
+ uint32_t clipped_triangle_count = clipped_indices / 3 ;
298
+ uint32_t tris_to_pad = ((clipped_triangle_count + 3 ) & ~3 ) - clipped_triangle_count;
299
+ uint16_t i0 = output_indices[clipped_indices - 3 ];
300
+ uint16_t i1 = output_indices[clipped_indices - 2 ];
301
+ uint16_t i2 = output_indices[clipped_indices - 1 ];
302
+ for (uint32_t i = 0 ; i < tris_to_pad; ++i)
279
303
{
280
- clipped_triangles[ clipped_triangle_count*3 + 0 ] = v0;
281
- clipped_triangles[ clipped_triangle_count*3 + 1 ] = v1;
282
- clipped_triangles[ clipped_triangle_count*3 + 2 ] = v2;
304
+ output_indices[clipped_indices + 0 ] = i0;
305
+ output_indices[clipped_indices + 1 ] = i1;
306
+ output_indices[clipped_indices + 2 ] = i2;
307
+ clipped_indices += 3 ;
283
308
}
284
- push_triangle_batched (thread_data.data , flag, clipped_triangles, clipped_triangle_count* 3 , 0 , bounds_array, select_tiles, false );
309
+ push_triangle_batched (thread_data.data , flag, transformed_vertices, clipped_indices, output_indices , bounds_array, select_tiles);
285
310
}
286
311
287
- bool Rasterizer::occlude_object (const RESTRICT vec4_t * m, vec4_t v_min, vec4_t v_max, RESTRICT int * bounds_array)
312
+ bool Rasterizer::occlude_object (const vec4_t * RESTRICT m, vec4_t v_min, vec4_t v_max, int * RESTRICT bounds_array)
288
313
{
289
314
vec4_t g_total_width_v = Vector4 (g_total_width);
290
315
vec4_t g_total_height_v = Vector4 (g_total_height);
@@ -393,7 +418,7 @@ __forceinline vec4_t Rasterizer::get_tile_bounds(vec4_t v_min, vec4_t v_max)
393
418
return VecMax (VecMin (tile_bounds, m_tile_bounds), VecZero ());
394
419
}
395
420
396
- void Rasterizer::flush_thread_data (ThreadData& thread_data)
421
+ void Rasterizer::flush_thread_data (ThreadData& RESTRICT thread_data)
397
422
{
398
423
assert (m_mt);
399
424
@@ -423,7 +448,7 @@ void Rasterizer::flush_thread_data(ThreadData& thread_data)
423
448
thread_data.clear ();
424
449
}
425
450
426
- void Rasterizer::push_objects (const RESTRICT Object* objects, uint32_t object_count, uint32_t thread_index)
451
+ void Rasterizer::push_objects (const Object* RESTRICT objects, uint32_t object_count, uint32_t thread_index)
427
452
{
428
453
ThreadData & thread_data = m_mt ? m_thread_data[thread_index] : m_data;
429
454
if (m_mt)
@@ -494,9 +519,9 @@ void Rasterizer::push_objects(const RESTRICT Object* objects, uint32_t object_co
494
519
495
520
bool select_tiles = !(bounds_array[0 ] + 2 > bounds_array[2 ] && bounds_array[1 ] + 2 > bounds_array[3 ]);
496
521
if (inside)
497
- push_triangle_batched (thread_data.data , flag, transformed_vertices, obj.index_count , obj.indices , bounds_array, select_tiles, true );
522
+ push_triangle_batched (thread_data.data , flag, transformed_vertices, obj.index_count , obj.indices , bounds_array, select_tiles);
498
523
else
499
- push_object_clipped (thread_data, obj.indices , obj.index_count , transformed_vertices, bounds_array, flag, select_tiles);
524
+ push_object_clipped (thread_data, obj.indices , obj.index_count , transformed_vertices, obj. vertex_count , bounds_array, flag, select_tiles);
500
525
}
501
526
502
527
if (m_mt)
@@ -566,6 +591,8 @@ void Rasterizer::Init(uint32_t num_threads)
566
591
m_tiles.push_back (Tile (i, j));
567
592
}
568
593
594
+ m_tile_height_v = Vector4 (Tile::g_tile_height);
595
+
569
596
m_masks.resize (g_width*g_max_masks_per_tile);
570
597
for (int tile = 0 ; tile < g_width; ++tile)
571
598
{
@@ -622,6 +649,8 @@ void Rasterizer::begin(const Matrix& m)
622
649
m_triangles_drawn_occludee_total = 0 ;
623
650
m_triangles_skipped = 0 ;
624
651
m_triangles_offscreen = 0 ;
652
+ m_triangles_backface = 0 ;
653
+ m_full_groups = 0 ;
625
654
}
626
655
627
656
void Rasterizer::ThreadData::clear ()
0 commit comments