Skip to content

Commit

Permalink
embedding : add EOS token if not present (ggerganov#899)
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov authored and hodlen committed Apr 1, 2024
1 parent 7a74e9a commit 0d23dc4
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions examples/embedding/embedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
 // tokenize the prompts and trim
 std::vector<std::vector<int32_t>> inputs;
 for (const auto & prompt : prompts) {
-auto inp = ::llama_tokenize(ctx, prompt, true);
+auto inp = ::llama_tokenize(ctx, prompt, true, false);
 if (inp.size() > n_batch) {
 inp.resize(n_batch);
 }
 inputs.push_back(inp);
 }

+// add eos if not present
+for (auto & inp : inputs) {
+if (inp.empty() || inp.back() != llama_token_eos(model)) {
+inp.push_back(llama_token_eos(model));
+}
+}
+
 // tokenization stats
 if (params.verbose_prompt) {
 for (int i = 0; i < (int) inputs.size(); i++) {
Expand Down Expand Up @@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
 for (int j = 0; j < n_prompts; j++) {
 fprintf(stdout, "embedding %d: ", j);
 for (int i = 0; i < std::min(16, n_embd); i++) {
-fprintf(stdout, "%f ", emb[j * n_embd + i]);
+fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
 }
 fprintf(stdout, "\n");
 }
Expand Down

0 comments on commit 0d23dc4

Please sign in to comment.