// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

// Fused kernel combining the FastLanes 6-bit unpack with Frame-of-Reference (FoR) addition.
// Fusing the two passes avoids a round trip through global memory for the unpacked values.
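// Sizing, for reference: a FastLanes vector holds 1024 values, so at 6 bits per
// value it packs into 1024 * 6 / 8 = 768 bytes = 192 uint32_t words.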

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include "fastlanes_common.cuh"

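// MASK and INDEX are provided by fastlanes_common.cuh. Their definitions are not
// shown here; a minimal sketch of what this file assumes them to be (MASK(T, n)
// selects the lowest n bits; INDEX(row, lane) is the FastLanes transposed
// position, which for 32 lanes of 32-bit values reduces to row-major interleaving):
//
//   #define MASK(T, n) ((T)(((T)1 << (n)) - 1))
//   #define INDEX(row, lane) ((row) * 32 + (lane))
//
// The copy-out loop in fused_bitpack6_for_u32 below relies on this interleaving,
// so each thread reads back exactly the shared-memory slots it wrote.
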
// FastLanes unpack for 6-bit-wide packed values into 32-bit outputs ("6bw_32ow").
// Each thread owns one of 32 lanes and extracts that lane's 32 values; the
// lane's packed words are strided 32 apart in `in`.
__device__ void fls_unpack_6bw_32ow_device(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) {
    int i = thread_idx;
    uint32_t src;
    uint32_t tmp;

    src = in[i];
    tmp = (src >> 0) & MASK(uint32_t, 6);
    out[INDEX(0, i)] = tmp;
    tmp = (src >> 6) & MASK(uint32_t, 6);
    out[INDEX(1, i)] = tmp;
    tmp = (src >> 12) & MASK(uint32_t, 6);
    out[INDEX(2, i)] = tmp;
    tmp = (src >> 18) & MASK(uint32_t, 6);
    out[INDEX(3, i)] = tmp;
    tmp = (src >> 24) & MASK(uint32_t, 6);
    out[INDEX(4, i)] = tmp;
    tmp = (src >> 30) & MASK(uint32_t, 2);  // value 5 straddles words 0 and 1
    src = in[i + 32 * 1];
    tmp |= (src & MASK(uint32_t, 4)) << 2;
    out[INDEX(5, i)] = tmp;
    tmp = (src >> 4) & MASK(uint32_t, 6);
    out[INDEX(6, i)] = tmp;
    tmp = (src >> 10) & MASK(uint32_t, 6);
    out[INDEX(7, i)] = tmp;
    tmp = (src >> 16) & MASK(uint32_t, 6);
    out[INDEX(8, i)] = tmp;
    tmp = (src >> 22) & MASK(uint32_t, 6);
    out[INDEX(9, i)] = tmp;
    tmp = (src >> 28) & MASK(uint32_t, 4);  // value 10 straddles words 1 and 2
    src = in[i + 32 * 2];
    tmp |= (src & MASK(uint32_t, 2)) << 4;
    out[INDEX(10, i)] = tmp;
    tmp = (src >> 2) & MASK(uint32_t, 6);
    out[INDEX(11, i)] = tmp;
    tmp = (src >> 8) & MASK(uint32_t, 6);
    out[INDEX(12, i)] = tmp;
    tmp = (src >> 14) & MASK(uint32_t, 6);
    out[INDEX(13, i)] = tmp;
    tmp = (src >> 20) & MASK(uint32_t, 6);
    out[INDEX(14, i)] = tmp;
    tmp = (src >> 26) & MASK(uint32_t, 6);  // value 15 ends exactly on the word boundary
    out[INDEX(15, i)] = tmp;
    src = in[i + 32 * 3];
    tmp = (src >> 0) & MASK(uint32_t, 6);
    out[INDEX(16, i)] = tmp;
    tmp = (src >> 6) & MASK(uint32_t, 6);
    out[INDEX(17, i)] = tmp;
    tmp = (src >> 12) & MASK(uint32_t, 6);
    out[INDEX(18, i)] = tmp;
    tmp = (src >> 18) & MASK(uint32_t, 6);
    out[INDEX(19, i)] = tmp;
    tmp = (src >> 24) & MASK(uint32_t, 6);
    out[INDEX(20, i)] = tmp;
    tmp = (src >> 30) & MASK(uint32_t, 2);  // value 21 straddles words 3 and 4
    src = in[i + 32 * 4];
    tmp |= (src & MASK(uint32_t, 4)) << 2;
    out[INDEX(21, i)] = tmp;
    tmp = (src >> 4) & MASK(uint32_t, 6);
    out[INDEX(22, i)] = tmp;
    tmp = (src >> 10) & MASK(uint32_t, 6);
    out[INDEX(23, i)] = tmp;
    tmp = (src >> 16) & MASK(uint32_t, 6);
    out[INDEX(24, i)] = tmp;
    tmp = (src >> 22) & MASK(uint32_t, 6);
    out[INDEX(25, i)] = tmp;
    tmp = (src >> 28) & MASK(uint32_t, 4);  // value 26 straddles words 4 and 5
    src = in[i + 32 * 5];
    tmp |= (src & MASK(uint32_t, 2)) << 4;
    out[INDEX(26, i)] = tmp;
    tmp = (src >> 2) & MASK(uint32_t, 6);
    out[INDEX(27, i)] = tmp;
    tmp = (src >> 8) & MASK(uint32_t, 6);
    out[INDEX(28, i)] = tmp;
    tmp = (src >> 14) & MASK(uint32_t, 6);
    out[INDEX(29, i)] = tmp;
    tmp = (src >> 20) & MASK(uint32_t, 6);
    out[INDEX(30, i)] = tmp;
    tmp = (src >> 26) & MASK(uint32_t, 6);
    out[INDEX(31, i)] = tmp;
}

// Frame-of-Reference addition over this thread's lane: add the reference value
// back to every unpacked delta. Assumes one thread per lane with blockDim.x == 32,
// so blockDim.x equals the number of values per lane.
template<typename ValueT>
__device__ __forceinline__ void for_device(
    ValueT *__restrict values_in_out,
    ValueT reference,
    int thread_idx
) {
    auto i = thread_idx;
    const int thread_ops = blockDim.x;  // 32 values per lane under the launch config above

    for (auto j = 0; j < thread_ops; j++) {
        auto idx = INDEX(j, i);
        values_in_out[idx] = values_in_out[idx] + reference;
    }
}
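
// Worked FoR example for one lane: with reference = 1000 and unpacked deltas
// {0, 5, 63}, the lane's outputs become {1000, 1005, 1063}.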

// Fused kernel: 6-bit FastLanes unpack + FoR addition in one pass.
// This eliminates the intermediate write-to-memory and read-from-memory by
// staging the unpacked values in shared memory and immediately adding the reference.
extern "C" __global__ void fused_bitpack6_for_u32(
    const uint32_t *__restrict packed_in,
    uint32_t *__restrict unpacked_out,
    uint32_t reference
) {
    int i = threadIdx.x;  // lane id; launched with 32 threads per block
    // One block per 1024-value vector: 1024 * 6 bits = 192 packed uint32_t words.
    auto in = packed_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t)));
    auto out = unpacked_out + (blockIdx.x * 1024);

    __shared__ uint32_t shared_data[1024];

    fls_unpack_6bw_32ow_device(in, shared_data, i);

    // No __syncthreads() is needed between these steps: with
    // INDEX(row, lane) == row * 32 + lane, every thread only touches
    // the 32 shared-memory slots belonging to its own lane.
    for_device(shared_data, reference, i);

    for (int j = 0; j < 32; j++) {
        auto idx = j * 32 + threadIdx.x;
        out[idx] = shared_data[idx];
    }
}
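
// A minimal host-side launch sketch (illustrative, not part of this file's API:
// `d_packed`, `d_out`, `num_values`, and `reference` are hypothetical caller
// state, and `num_values` is assumed to be a multiple of 1024):
//
//   int num_blocks = num_values / 1024;  // one block per FastLanes vector
//   fused_bitpack6_for_u32<<<num_blocks, 32>>>(d_packed, d_out, reference);
//   cudaDeviceSynchronize();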