@@ -5559,127 +5559,6 @@ void ggml_compute_forward_softcap(
     }
 }
 
-// ggml_compute_forward_softcap_max
-
-static void ggml_compute_forward_softcap_max_f32(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    float values[4];
-    memcpy(values, dst->op_params, sizeof(values));
-
-    // memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    // memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // const int64_t ne11 = src1 ? src1->ne[1] : 1;
-
-    // TODO: is this supposed to be ceil instead of floor?
-    //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head      = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(values[1]       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(values[1] / 2.0f) / n_head_log2);
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        // ALiBi
-        const uint32_t h = (i1/ne01)%ne02; // head
-        const float slope = (values[1] > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
-
-        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data + i1*dst->nb[1]);
-
-        // broadcast the mask across rows
-        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-
-        ggml_vec_cpy_f32  (nc, wp, sp);
-        ggml_vec_scale_f32(nc, wp, values[0]);
-        if (mp_f32) {
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*mp_f32[i];
-                }
-            }
-        }
-
-        ggml_vec_softcap_f32(nc, wp, values[2], values[3]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            // printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
-
-        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
-#endif
-    }
-
-}
-
-void ggml_compute_forward_softcap_max(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_softcap_max_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-}
-
 // ggml_compute_forward_set
 
 static void ggml_compute_forward_set_f32(