@@ -194,6 +194,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] + __float2half(y[i]);
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1209,6 +1218,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1675,15 +1689,21 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne0 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
@@ -2281,8 +2301,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    // ggml_cuda_add permits an f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // In practice it makes no difference because flatten_rows == true.
+    // A better solution would be nice, but right now it would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2535,7 +2561,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2544,22 +2570,23 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2598,11 +2625,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
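
For readers who want to exercise the new mixed-precision add path in isolation, below is a minimal standalone CUDA sketch (not part of the patch) that mirrors the add_f16_f32_f16 kernel and its launcher. The block size of 256 (standing in for CUDA_ADD_BLOCK_SIZE), the element count, and the test values are assumptions for illustration; the kernel body here converts to float before adding so it also builds for pre-sm_53 devices, whereas the patch performs the addition directly on half operands.

// Standalone illustration only; constants and test sizes below are assumptions, not from the patch.
#include <cstdio>
#include <vector>
#include <cuda_fp16.h>

#define CUDA_ADD_BLOCK_SIZE 256   // assumed block size, playing the role of the constant used by the launchers

// Elementwise dst[i] = x[i] + y[i] with x/dst in half precision and y in single precision.
static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = __float2half(__half2float(x[i]) + y[i]);
}

// Launcher: rounds the grid size up so that all k elements are covered.
static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}

int main() {
    const int k = 1000;  // arbitrary test size

    std::vector<half>  x(k);
    std::vector<float> y(k);
    std::vector<half>  dst(k);
    for (int i = 0; i < k; ++i) {
        x[i] = __float2half((float) i);
        y[i] = 0.5f;
    }

    half * d_x; float * d_y; half * d_dst;
    cudaMalloc(&d_x,   k*sizeof(half));
    cudaMalloc(&d_y,   k*sizeof(float));
    cudaMalloc(&d_dst, k*sizeof(half));
    cudaMemcpy(d_x, x.data(), k*sizeof(half),  cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    add_f16_f32_f16_cuda(d_x, d_y, d_dst, k, 0);   // default stream
    cudaMemcpy(dst.data(), d_dst, k*sizeof(half), cudaMemcpyDeviceToHost);

    printf("dst[3] = %.2f (expected 3.50)\n", __half2float(dst[3]));

    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_dst);
    return 0;
}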
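
Presumably the new entry point also needs a matching declaration in the CUDA backend header, which is not part of this excerpt. A sketch of what that declaration could look like, following the pattern of the existing ggml_cuda_assign_buffers declarations (hypothetical, since the header change is not shown):

// Hypothetical ggml-cuda.h addition (the header is not shown in this excerpt).
// Assigns a GPU buffer without using the scratch buffer and forces the tensor to reuse
// its src0 device allocation, i.e. ggml_cuda_assign_buffers_impl(tensor, false, true).
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);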