Skip to content

Commit

Permalink
llama : add ability to cancel model loading (ggerganov#4462)
Browse files Browse the repository at this point in the history
* llama : Add ability to cancel model load

Updated llama_progress_callback to return a bool: if the callback returns
false, model loading is aborted.

* llama : Add test for model load cancellation

* Fix bool return in llama_model_load, remove std::ignore use

* Update llama.cpp

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Fail test if model file is missing

* Revert "Fail test if model file is missing"

This reverts commit 32ebd52.

* Add test-model-load-cancel to Makefile

* Revert "Revert "Fail test if model file is missing""

This reverts commit 2796953.

* Simplify .gitignore for tests, clang-tidy fixes

* Label all ctest tests

* ci : ctest uses -L main

* Attempt at writing ctest_with_model

* ci : get ci/run.sh working with test-model-load-cancel

* ci : restrict .github/workflows/build.yml ctest to -L main

* update requirements.txt

* Disable test-model-load-cancel in make

* Remove venv before creation

* Restructure requirements.txt

Top-level now imports the specific additional requirements for each
python file. Using `pip install -r requirements.txt` will fail if
versions become mismatched in the per-file requirements.

* Make per-python-script requirements work alone

This doesn't break the main requirements.txt.

* Add comment

* Add convert-persimmon-to-gguf.py to new requirements.txt scheme

* Add check-requirements.sh script and GitHub workflow

* Remove shellcheck installation step from workflow

* Add nocleanup special arg

* Fix merge

see: ggerganov#4462 (comment)

* reset to upstream/master

* Redo changes for cancelling model load

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
  • Loading branch information
3 people authored Dec 22, 2023
1 parent afefa31 commit c7e9701
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 15 deletions.
46 changes: 33 additions & 13 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2372,7 +2372,8 @@ struct llama_model_loader {
}
}

void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
// Returns false if cancelled by progress_callback
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
size_t size_data = 0;

for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
Expand Down Expand Up @@ -2404,7 +2405,9 @@ struct llama_model_loader {
GGML_ASSERT(cur); // unused tensors should have been caught by load_data already

if (progress_callback) {
progress_callback((float) size_done / size_data, progress_callback_user_data);
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
return false;
}
}

const size_t offs = file_offset(ggml_get_name(cur));
Expand Down Expand Up @@ -2466,8 +2469,11 @@ struct llama_model_loader {
}

if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
// Even though the model is done loading, we still honor
// cancellation since we need to free allocations.
return progress_callback(1.0f, progress_callback_user_data);
}
return true;
}
};

Expand Down Expand Up @@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
}

static void llm_load_tensors(
// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
llama_model_loader & ml,
llama_model & model,
int n_gpu_layers,
Expand Down Expand Up @@ -3722,16 +3729,20 @@ static void llm_load_tensors(
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}

ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
return false;
}

model.mapping = std::move(ml.mapping);

// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return true;
}

static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
try {
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

Expand All @@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con

if (params.vocab_only) {
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
return true;
return 0;
}

llm_load_tensors(
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
);
)) {
return -2;
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
return false;
return -1;
}

return true;
return 0;
}

//
Expand Down Expand Up @@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
LLAMA_LOG_INFO("\n");
}
}
return true;
};
}

if (!llama_model_load(path_model, *model, params)) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
delete model;
return nullptr;
}
Expand Down
6 changes: 4 additions & 2 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;

typedef void (*llama_progress_callback)(float progress, void *ctx);
typedef bool (*llama_progress_callback)(float progress, void *ctx);

// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
Expand Down Expand Up @@ -180,7 +180,9 @@ extern "C" {
int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

// called with a progress value between 0 and 1, pass NULL to disable
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.
llama_progress_callback progress_callback;

// context pointer passed to the progress callback
Expand Down

0 comments on commit c7e9701

Please sign in to comment.