Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions vortex-cuda/cuda_kernel_generator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
let bits = <T>::T;
let lanes = T::LANES;
let per_thread_loop_count = lanes / thread_count;
let shared_copy_ncount = 1024 / thread_count;

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");

Expand All @@ -99,10 +100,18 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;

output.indent(|output| {
writeln!(output, "__shared__ uint{bits}_t shared_out[1024];")?;

for thread_lane in 0..per_thread_loop_count {
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, shared_out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
}
Ok(())

writeln!(output, "for (int i = 0; i < {shared_copy_ncount}; i++) {{")?;
output.indent(|output| {
writeln!(output, "auto idx = i * {thread_count} + thread_idx;")?;
writeln!(output, "out[idx] = shared_out[idx];")
})?;
writeln!(output, "}}")
})?;

writeln!(output, "}}")
Expand Down
153 changes: 119 additions & 34 deletions vortex-cuda/kernels/src/bit_unpack_16.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,13 @@ __device__ void _bit_unpack_16_0bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_0bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_0bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_0bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_0bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_0bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -79,8 +84,13 @@ __device__ void _bit_unpack_16_1bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_1bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_1bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_1bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_1bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_1bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -133,8 +143,13 @@ __device__ void _bit_unpack_16_2bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_2bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_2bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_2bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_2bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_2bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -189,8 +204,13 @@ __device__ void _bit_unpack_16_3bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_3bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_3bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_3bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_3bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_3bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -247,8 +267,13 @@ __device__ void _bit_unpack_16_4bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_4bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_4bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_4bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_4bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_4bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -307,8 +332,13 @@ __device__ void _bit_unpack_16_5bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_5bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_5bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_5bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_5bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_5bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -369,8 +399,13 @@ __device__ void _bit_unpack_16_6bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_6bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_6bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_6bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_6bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_6bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -433,8 +468,13 @@ __device__ void _bit_unpack_16_7bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_7bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_7bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_7bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_7bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_7bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -499,8 +539,13 @@ __device__ void _bit_unpack_16_8bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_8bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_8bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_8bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_8bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_8bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -567,8 +612,13 @@ __device__ void _bit_unpack_16_9bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_9bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_9bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_9bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_9bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_9bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -637,8 +687,13 @@ __device__ void _bit_unpack_16_10bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_10bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_10bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_10bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_10bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_10bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -709,8 +764,13 @@ __device__ void _bit_unpack_16_11bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_11bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_11bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_11bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_11bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_11bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -783,8 +843,13 @@ __device__ void _bit_unpack_16_12bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_12bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_12bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_12bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_12bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_12bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -859,8 +924,13 @@ __device__ void _bit_unpack_16_13bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_13bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_13bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_13bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_13bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_13bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -937,8 +1007,13 @@ __device__ void _bit_unpack_16_14bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_14bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_14bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_14bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_14bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_14bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -1017,8 +1092,13 @@ __device__ void _bit_unpack_16_15bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_15bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_15bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_15bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_15bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_15bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down Expand Up @@ -1050,8 +1130,13 @@ __device__ void _bit_unpack_16_16bw_lane(const uint16_t *__restrict in, uint16_t
}

__device__ void _bit_unpack_16_16bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) {
_bit_unpack_16_16bw_lane(in, out, thread_idx * 2 + 0);
_bit_unpack_16_16bw_lane(in, out, thread_idx * 2 + 1);
__shared__ uint16_t shared_out[1024];
_bit_unpack_16_16bw_lane(in, shared_out, thread_idx * 2 + 0);
_bit_unpack_16_16bw_lane(in, shared_out, thread_idx * 2 + 1);
for (int i = 0; i < 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) {
Expand Down
Loading
Loading