@@ -1646,6 +1646,7 @@ struct llama_cparams {
16461646    float defrag_thold;
16471647
16481648    bool mul_mat_q;
1649+     bool embeddings;
16491650    bool offload_kqv;
16501651    bool do_pooling;
16511652
@@ -1936,16 +1937,16 @@ struct llama_context {
19361937    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
19371938    int32_t n_eval   = 0; // number of eval calls
19381939
1939-     // decode output (2-dimensional array: [n_tokens][n_vocab])
1940+     // logits output (2-dimensional array: [n_tokens][n_vocab])
19401941    std::vector<float> logits;
19411942#ifndef NDEBUG
19421943    // guard against access to unset logits
19431944    std::vector<bool>  logits_valid;
19441945#endif
19451946    bool logits_all = false;
19461947
1947-     // input embedding (1-dimensional array: [n_embd])
1948-     std::vector<float> embedding;
1948+     // embeddings output (2-dimensional array: [n_tokens][n_embd])
1949+     std::vector<float> embeddings;
19491950
19501951    // memory buffers used to evaluate the model
19511952    std::vector<uint8_t> buf_compute_meta;
@@ -5987,9 +5988,10 @@ struct llm_build_context {
59875988
59885989        // get input vectors with right size
59895990        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5990-         struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5991+ 
5992+         struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos,  n_tokens, 0);
59915993        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5992-         struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5994+         struct ggml_tensor * inp_cls   = ggml_view_1d(ctx0, lctx.inp_cls,   n_tokens, 0);
59935995
59945996        // construct input embeddings (token, type, position)
59955997        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -7971,17 +7973,17 @@ static int llama_decode_internal(
79717973    ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
79727974
79737975    // the output is always the last tensor in the graph
7974-     struct ggml_tensor * res         = gf->nodes[gf->n_nodes - 1];
7975-     struct ggml_tensor * embeddings  = gf->nodes[gf->n_nodes - 2];
7976+     struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
7977+     struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
79767978
79777979    if (strcmp(res->name, "result_output") == 0) {
79787980        // the embeddings could be the second to last tensor, or the third to last tensor
7979-         if (strcmp(embeddings->name, "result_norm") != 0) {
7980-             embeddings = gf->nodes[gf->n_nodes - 3];
7981-             GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7981+         if (strcmp(embd->name, "result_norm") != 0) {
7982+             embd = gf->nodes[gf->n_nodes - 3];
7983+             GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
79827984        }
79837985    } else if (strcmp(res->name, "result_embd") == 0) {
7984-         embeddings = res;
7986+         embd = res;
79857987        res = nullptr;
79867988    } else {
79877989        GGML_ASSERT(false);
@@ -8051,46 +8053,53 @@ static int llama_decode_internal(
80518053        logits_out.clear();
80528054#endif
80538055
8054-         ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8055-         GGML_ASSERT(res_backend != nullptr);
8056+         ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8057+         GGML_ASSERT(backend_res != nullptr);
8058+ 
80568059        if (batch.logits) {
80578060            logits_out.resize(n_vocab * n_tokens);
80588061            for (uint32_t i = 0; i < n_tokens; i++) {
80598062                if (batch.logits[i] == 0) {
80608063                    continue;
80618064                }
8062-                 ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8065+                 ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
80638066#ifndef NDEBUG
80648067                logits_valid[i] = true;
80658068#endif
80668069            }
80678070        } else if (lctx.logits_all) {
80688071            logits_out.resize(n_vocab * n_tokens);
8069-             ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8072+             ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
80708073#ifndef NDEBUG
80718074            std::fill(logits_valid.begin(), logits_valid.end(), true);
80728075#endif
80738076        } else {
80748077            logits_out.resize(n_vocab);
8075-             ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8078+             ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
80768079#ifndef NDEBUG
80778080            logits_valid[0] = true;
80788081#endif
80798082        }
8080-         ggml_backend_synchronize(res_backend);
8083+         ggml_backend_synchronize(backend_res);
80818084    }
80828085
80838086    // extract embeddings
8084-     if (!lctx.embedding.empty()) {
8085-         auto & embedding_out = lctx.embedding;
8087+     if (cparams.embeddings && embd) {
8088+         auto & embeddings_out = lctx.embeddings;
80868089
8087-         const int64_t embd_pos  = res ? n_embd * (n_tokens-1) : 0;
8088-         const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8090+         ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8091+         GGML_ASSERT(backend_embd != nullptr);
80898092
8090-         embedding_out.resize(embd_size);
8091-         ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8092-         ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8093-         ggml_backend_synchronize(embeddings_backend);
8093+         if (batch.logits) {
8094+             embeddings_out.resize(n_embd * n_tokens);
8095+             for (uint32_t i = 0; i < n_tokens; i++) {
8096+                 if (batch.logits[i] == 0) {
8097+                     continue;
8098+                 }
8099+                 ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8100+             }
8101+         }
8102+         ggml_backend_synchronize(backend_embd);
80948103    }
80958104
80968105    // measure the performance only for the single-token evals
@@ -11634,7 +11643,7 @@ struct llama_context_params llama_context_default_params() {
1163411643        /*.type_v                      =*/ GGML_TYPE_F16,
1163511644        /*.mul_mat_q                   =*/ true,
1163611645        /*.logits_all                  =*/ false,
11637-         /*.embedding                   =*/ false,
11646+         /*.embeddings                  =*/ false,
1163811647        /*.offload_kqv                 =*/ true,
1163911648        /*.do_pooling                  =*/ true,
1164011649    };
@@ -11785,6 +11794,7 @@ struct llama_context * llama_new_context_with_model(
1178511794    cparams.yarn_beta_slow   = params.yarn_beta_slow;
1178611795    cparams.defrag_thold     = params.defrag_thold;
1178711796    cparams.mul_mat_q        = params.mul_mat_q;
11797+     cparams.embeddings       = params.embeddings;
1178811798    cparams.offload_kqv      = params.offload_kqv;
1178911799    cparams.do_pooling       = params.do_pooling;
1179011800
@@ -11933,8 +11943,8 @@ struct llama_context * llama_new_context_with_model(
1193311943        // resized during inference, reserve maximum
1193411944        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
1193511945
11936-         if (params.embedding) {
11937-             ctx->embedding.resize(hparams.n_embd);
11946+         if (params.embeddings) {
11947+             ctx->embeddings.reserve(hparams.n_embd*cparams.n_batch);
1193811948        }
1193911949
1194011950        // graph inputs
@@ -12369,7 +12379,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
1236912379    // assume worst case for logits although only currently set ones are serialized
1237012380    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
1237112381    const size_t s_embedding_size  = sizeof(size_t);
12372-     const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
12382+     const size_t s_embedding       = ctx->embeddings.capacity() * sizeof(float);
1237312383    const size_t s_kv_size         = sizeof(size_t);
1237412384    const size_t s_kv_ntok         = sizeof(int);
1237512385    const size_t s_kv              = ctx->kv_self.total_size();
@@ -12470,12 +12480,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
1247012480
1247112481    // copy embeddings
1247212482    {
12473-         const size_t embedding_size = ctx->embedding.size();
12483+         const size_t embeddings_size = ctx->embeddings.size();
1247412484
12475-         data_ctx->write(&embedding_size, sizeof(embedding_size));
12485+         data_ctx->write(&embeddings_size, sizeof(embeddings_size));
1247612486
12477-         if (embedding_size) {
12478-             data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12487+         if (embeddings_size) {
12488+             data_ctx->write(ctx->embeddings.data(), embeddings_size * sizeof(float));
1247912489        }
1248012490    }
1248112491
@@ -12581,15 +12591,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
1258112591
1258212592    // set embeddings
1258312593    {
12584-         size_t embedding_size;
12594+         size_t embeddings_size;
12595+ 
12596+         memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
1258512597
12586-         memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12598+         GGML_ASSERT(ctx->embeddings.capacity() == embeddings_size);
1258712599
12588-         GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12600+         if (embeddings_size) {
12601+             ctx->embeddings.resize(embeddings_size);
1258912602
12590-         if (embedding_size) {
12591-             memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12592-             inp += embedding_size * sizeof(float);
12603+             memcpy(ctx->embeddings.data(), inp, embeddings_size * sizeof(float));
12604+             inp += embeddings_size * sizeof(float);
1259312605        }
1259412606    }
1259512607
@@ -12829,11 +12841,11 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
1282912841}
1283012842
1283112843float * llama_get_embeddings(struct llama_context * ctx) {
12832-     return ctx->embedding.data();
12844+     return ctx->embeddings.data();
1283312845}
1283412846
1283512847float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12836-     return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12848+     return ctx->embeddings.data() + i*ctx->model.hparams.n_embd;
1283712849}
1283812850
1283912851const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
0 commit comments