File tree Expand file tree Collapse file tree 1 file changed +5
-0
lines changed Expand file tree Collapse file tree 1 file changed +5
-0
lines changed Original file line number Diff line number Diff line change @@ -281,6 +281,8 @@ cvt_fp16_to_fp4(
281
281
}
282
282
}
283
283
} else {
284
+ // Load input offsets into registers first, then do the computation.
285
+ // The local array size is set to 17 because of the register limit.
284
286
uint32_t local_offsets[17 ];
285
287
for (int chunk_start = 0 ; chunk_start < n_experts; chunk_start += 16 ) {
286
288
*reinterpret_cast <int4 *>(local_offsets) =
@@ -350,6 +352,9 @@ cvt_fp16_to_fp4(
350
352
" Vec size is not matched." );
351
353
extern __shared__ uint32_t shared_input_offsets[];
352
354
355
+ // Load input offsets into shared memory.
356
+ // If n_experts is larger than 4, use vectorized int4 loads to save instructions.
357
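+ // If n_experts is smaller than 4 (the SMALL_NUM_EXPERTS path), each thread copies
+ // individual offsets directly, one element per loop iteration.
+ // NOTE(review): these comments do not say which path n_experts == 4 takes — confirm.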
353
358
if constexpr (SMALL_NUM_EXPERTS) {
354
359
for (int i = threadIdx .x ; i < n_experts + 1 ; i += blockDim .x ) {
355
360
shared_input_offsets[i] = input_offset_by_experts[i];
You can’t perform that action at this time.
0 commit comments