fix embeddings when using CUDA (#3657)
slaren authored Oct 17, 2023
1 parent e1675d1 commit cb33f43
1 changed file with 13 additions and 6 deletions: llama.cpp
@@ -5903,6 +5903,13 @@ static int llama_decode_internal(
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
+    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
+    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+
+
 #ifdef GGML_USE_CUBLAS
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
@@ -5920,6 +5927,12 @@ static int llama_decode_internal(
     }
 
     ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
+    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
+    if (!lctx.embedding.empty()) {
+        embeddings->backend = GGML_BACKEND_CPU;
+    }
+    res->backend = GGML_BACKEND_CPU;
 #endif
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -5944,12 +5957,6 @@ static int llama_decode_internal(
         n_threads = 1;
     }
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
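
For context on the HACK comment in the second hunk: after the graph is computed, llama_decode_internal copies the logits and embeddings out of the graph with plain host-side memcpy calls, so the res and embeddings tensors must end up in host (CPU) memory even when the rest of the graph runs on CUDA. The sketch below is a simplified, self-contained illustration of that readback, not code from this commit; read_embeddings_host is a hypothetical helper, and n_embd / n_tokens stand in for values computed elsewhere in llama_decode_internal.

// Simplified sketch of the host-side readback that motivates forcing
// GGML_BACKEND_CPU on the output tensors (assumes ggml.h from this repo;
// not part of the commit itself).
#include <cstring>
#include <vector>
#include "ggml.h"

static void read_embeddings_host(const struct ggml_tensor * embeddings,
                                 std::vector<float> & out,
                                 int64_t n_embd, int64_t n_tokens) {
    // Valid only if the tensor's data lives in host memory, i.e. its backend
    // is GGML_BACKEND_CPU; a device-resident pointer would fault here.
    out.resize(n_embd);
    const float * src = (const float *) ggml_get_data(embeddings);
    // Copy the embedding of the last token in the batch.
    std::memcpy(out.data(), src + n_embd * (n_tokens - 1), sizeof(float) * n_embd);
}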
