This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit bb0feab (parent fbe3082)

feat: add embedding endpoint

File tree: 3 files changed, +37 -6 lines changed

  config.json
  controllers/llamaCPP.cc
  controllers/llamaCPP.h

config.json
Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@
   "custom_config": {
     "llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
     "ctx_len": 2048,
-    "ngl": 100
+    "ngl": 100,
+    "embedding":true
   }
 }

controllers/llamaCPP.cc
Lines changed: 28 additions & 1 deletion

@@ -3,8 +3,9 @@
 #include "nitro_utils.h"
 #include <chrono>
 #include <cstring>
-#include <thread>
+#include <drogon/HttpResponse.h>
 #include <regex>
+#include <thread>
 
 using namespace inferences;
 
@@ -198,3 +199,29 @@ void llamaCPP::chatCompletion(
       "chat_completions.txt");
   callback(resp);
 }
+
+void llamaCPP::embedding(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+  auto lock = llama.lock();
+
+  const auto &jsonBody = req->getJsonObject();
+
+  llama.rewind();
+  llama_reset_timings(llama.ctx);
+  if (jsonBody->isMember("content") != 0) {
+    llama.prompt = (*jsonBody)["content"].asString();
+  } else {
+    llama.prompt = "";
+  }
+  llama.params.n_predict = 0;
+  llama.loadPrompt();
+  llama.beginCompletion();
+  llama.doCompletion();
+
+  const json data = format_embedding_response(llama);
+  auto resp = drogon::HttpResponse::newHttpResponse();
+  resp->setBody(data.dump());
+  resp->setContentTypeString("application/json");
+  callback(resp);
+}
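
For reference, a minimal client-side sketch of calling the new handler is shown below. It is not part of this commit: the route path assumes drogon's default HttpController mapping (namespace/class/handler, i.e. /inferences/llamaCPP/embedding, since METHOD_ADD registers "embedding" on the llamaCPP controller in namespace inferences), the host and port (localhost:3928) are assumptions, and the reply is whatever JSON format_embedding_response() produces.

// Hypothetical usage sketch (not in this commit): POST a "content" field to the
// embedding handler with drogon's HttpClient and print the JSON reply.
#include <drogon/drogon.h>
#include <json/json.h>
#include <string>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://localhost:3928");  // assumed host/port

  Json::Value body;
  body["content"] = "hello world";  // the handler reads jsonBody["content"]
  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamaCPP/embedding");  // assumed default drogon mapping

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      LOG_INFO << std::string(resp->getBody());  // JSON from format_embedding_response
    }
    drogon::app().quit();  // stop the event loop once the reply arrives
  });
  drogon::app().run();  // sendRequest is asynchronous; the loop drives it
  return 0;
}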

controllers/llamaCPP.h
Lines changed: 7 additions & 4 deletions

@@ -1308,14 +1308,15 @@ static void append_to_generated_text_from_generated_token_probs(
 using namespace drogon;
 
 namespace inferences {
-class llamaCPP : public drogon::HttpController<llamaCPP> {
+class llamaCPP : public drogon::HttpController<llamaCPP> {
 public:
   llamaCPP() {
     gpt_params params;
     auto conf = drogon::app().getCustomConfig();
     params.model = conf["llama_model_path"].asString();
     params.n_gpu_layers = conf["ngl"].asInt();
     params.n_ctx = conf["ctx_len"].asInt();
+    params.embedding = conf["embedding"].asBool();
 #ifdef GGML_USE_CUBLAS
     LOG_INFO << "Setting up GGML CUBLAS PARAMS";
     params.mul_mat_q = false;
@@ -1345,15 +1346,17 @@ namespace inferences {
   METHOD_LIST_BEGIN
   // list path definitions here;
   METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
+  METHOD_ADD(llamaCPP::embedding,"embedding");
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
+  void embedding(const HttpRequestPtr &req,
+                 std::function<void(const HttpResponsePtr &)> &&callback);
 
 private:
   llama_server_context llama;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
-};
-}
-;
+};
+}; // namespace inferences
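
A note on the config flag: the constructor reads it with conf["embedding"].asBool(), which relies on JsonCpp returning a null value for a missing key and asBool() mapping null to false, so configs written before this commit simply leave embeddings disabled. If the fallback were to be spelled out explicitly, a defensive read could look like the single line below (an illustrative alternative, not what this commit does):

  // Hypothetical alternative: make the default explicit with JsonCpp's get()
  params.embedding = conf.get("embedding", false).asBool();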
