Commit d6c71e9

Add exception for HIP code
1 parent e1afe75 commit d6c71e9

File tree

ggml/src/ggml-cuda/common.cuh
ggml/src/ggml-cuda/mmvf.cu

2 files changed: +16 −11 lines

ggml/src/ggml-cuda/common.cuh

Lines changed: 0 additions & 11 deletions
@@ -532,7 +532,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif // defined(GGML_USE_HIP)
 }
 
-
 static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) {
     acc += v*u;
 }
@@ -571,16 +570,6 @@ static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v,
 #endif // FAST_FP16_AVAILABLE
 }
 
-
-#if defined(GGML_USE_HIP)
-static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const __hip_bfloat162 v, const __hip_bfloat162 u) {
-    const float2 tmpv = __bfloat162float2(v);
-    const float2 tmpu = __bfloat162float2(u);
-    acc += tmpv.x * tmpu.x;
-    acc += tmpv.y * tmpu.y;
-}
-#endif
-
 // Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
 template <int nbytes, int alignment = 0>
 static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 16 additions & 0 deletions
@@ -88,6 +88,21 @@ static __global__ void mul_mat_vec_f(
 #endif // FP16_AVAILABLE
         }
     } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+        //TODO: add support for ggml_cuda_mad for hip_bfloat162
+#if defined(GGML_USE_HIP)
+        const int * x2 = (const int *) x;
+        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
+            const int tmpx = x2[col2];
+#pragma unroll
+            for (int j = 0; j < ncols_dst; ++j) {
+                const float2 tmpy = y2[j*stride_col_y2 + col2];
+                const float tmpx0 = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]);
+                const float tmpx1 = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]);
+                ggml_cuda_mad(sumf[j], tmpx0, tmpy.x);
+                ggml_cuda_mad(sumf[j], tmpx1, tmpy.y);
+            }
+        }
+#else
         const nv_bfloat162 * x2 = (const nv_bfloat162 *) x;
         for (int col2 = tid; col2 < ncols2; col2 += block_size) {
             const nv_bfloat162 tmpx = x2[col2];
@@ -98,6 +113,7 @@ static __global__ void mul_mat_vec_f(
                 ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
             }
         }
+#endif
     } else {
         static_assert(std::is_same_v<T, void>, "unsupported type");
     }
