common: llama_load_model_from_url split support #6192

Merged 15 commits on Mar 23, 2024
Changes from 9 commits
196 changes: 153 additions & 43 deletions common/common.cpp
@@ -39,6 +39,9 @@
 #endif
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
+#include <curl/easy.h>
+#include <thread>
+#include <future>
 #endif

@@ -61,7 +64,7 @@
 #else
 #include <sys/syslimits.h>
 #endif
-#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 #define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL

@@ -1702,27 +1705,27 @@ void llama_batch_add(
 
 #ifdef LLAMA_USE_CURL
 
-struct llama_model * llama_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
-        fprintf(stderr, "%s: invalid model_url\n", __func__);
-        return NULL;
+static std::string llama_download_hide_password_in_url(const std::string & url) {
+    std::size_t protocol_pos = url.find("://");
+    if (protocol_pos == std::string::npos) {
+        return url; // Malformed URL
     }
 
-    // Initialize libcurl globally
-    auto curl = curl_easy_init();
-
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
+    std::size_t at_pos = url.find('@', protocol_pos + 3);
+    if (at_pos == std::string::npos) {
+        return url; // No password in URL
     }
 
+    return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+}
+
+static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+    bool force_download = false;
+
     // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, model_url);
+    curl_easy_setopt(curl, CURLOPT_URL, url);
     curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     // operating system. Currently implemented under MS-Windows.
@@ -1731,24 +1734,24 @@
 
     // Check if the file already exists locally
     struct stat model_file_info;
-    auto file_exists = (stat(path_model, &model_file_info) == 0);
+    auto file_exists = (stat(path, &model_file_info) == 0);
 
     // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
     char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
+    char etag_path[PATH_MAX] = {0};
+    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
 
     char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
+    char last_modified_path[PATH_MAX] = {0};
+    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
 
     if (file_exists) {
         auto * f_etag = fopen(etag_path, "r");
         if (f_etag) {
             if (!fgets(etag, sizeof(etag), f_etag)) {
                 fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
             } else {
-                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
+                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
             }
             fclose(f_etag);
         }
@@ -1758,7 +1761,7 @@
             if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
                 fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
             } else {
-                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
+                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
                         last_modified);
             }
             fclose(f_last_modified);
@@ -1776,6 +1779,11 @@
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
             llama_load_model_from_url_headers * headers = (llama_load_model_from_url_headers *) userdata;
 
+            // Convert header field name to lowercase
+            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
+                buffer[i] = tolower(buffer[i]);
+            }
+
             const char * etag_prefix = "etag: ";
             if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
                 strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@@ -1798,38 +1806,42 @@
         if (res != CURLE_OK) {
             curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
-            return NULL;
+            return false;
         }
 
         long http_code = 0;
         curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code != 200) {
             // HEAD not supported, we don't know if the file has changed
             // force trigger downloading
-            file_exists = false;
+            force_download = true;
             fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
         }
     }
 
     // If the ETag or the Last-Modified headers are different: trigger a new download
-    if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
-        char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
-        snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
+    bool should_download = !file_exists
+        || force_download
+        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
+        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+    if (should_download) {
+        char path_temporary[PATH_MAX] = {0};
+        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
         if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
-            if (remove(path_model) != 0) {
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
+            if (remove(path) != 0) {
                 curl_easy_cleanup(curl);
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
-                return NULL;
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+                return false;
             }
         }
 
         // Set the output file
-        auto * outfile = fopen(path_model_temporary, "wb");
+        auto * outfile = fopen(path_temporary, "wb");
        if (!outfile) {
             curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
-            return NULL;
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+            return false;
         }
 
         typedef size_t (*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
@@ -1844,14 +1856,14 @@
         curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
[review comment] phymbert (Collaborator, Author), Mar 23, 2024:
Here the progress of each split will be flushed to stderr concurrently; it can be improved later on, but the actual final result is good enough for a first version.
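As an illustration of one possible later improvement (not part of this PR), the progress lines could be serialized behind a mutex via libcurl's transfer-info callback. The callback name and its wiring below are a sketch under that assumption, not the PR's code:

```cpp
#include <curl/curl.h>
#include <cstdio>
#include <mutex>

// Sketch: one shared mutex so parallel split downloads do not interleave
// their progress lines on stderr.
static std::mutex g_progress_mutex;

static int xferinfo_cb(void * clientp, curl_off_t dltotal, curl_off_t dlnow,
                       curl_off_t /*ultotal*/, curl_off_t /*ulnow*/) {
    const char * name = (const char *) clientp; // e.g. the split path
    if (dltotal > 0) {
        std::lock_guard<std::mutex> lock(g_progress_mutex);
        fprintf(stderr, "%s: %3d%%\n", name, (int) ((100 * dlnow) / dltotal));
    }
    return 0; // returning non-zero aborts the transfer
}

// Hypothetical wiring (per libcurl docs):
//   curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, xferinfo_cb);
//   curl_easy_setopt(curl, CURLOPT_XFERINFODATA,     (void *) split_path);
//   curl_easy_setopt(curl, CURLOPT_NOPROGRESS,       0L);
```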


         // start the download
-        fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            model_url, path_model, headers.etag, headers.last_modified);
+        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+            llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
         auto res = curl_easy_perform(curl);
         if (res != CURLE_OK) {
             fclose(outfile);
             curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
-            return NULL;
+            return false;
         }

         long http_code = 0;
@@ -1860,7 +1872,7 @@
             fclose(outfile);
             curl_easy_cleanup(curl);
             fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
-            return NULL;
+            return false;
         }

         // Clean up
@@ -1872,7 +1884,7 @@
             if (etag_file) {
                 fputs(headers.etag, etag_file);
                 fclose(etag_file);
-                fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
+                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
             }
         }

@@ -1882,20 +1894,118 @@
             if (last_modified_file) {
                 fputs(headers.last_modified, last_modified_file);
                 fclose(last_modified_file);
-                fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
+                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
                         headers.last_modified);
             }
         }
 
-        if (rename(path_model_temporary, path_model) != 0) {
+        if (rename(path_temporary, path) != 0) {
             curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
-            return NULL;
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+            return false;
         }
     }
 
+    return true;
+}

+struct llama_model * llama_load_model_from_url(
+        const char * model_url,
+        const char * path_model,
+        const struct llama_model_params & params) {
+    // Basic validation of the model_url
+    if (!model_url || strlen(model_url) == 0) {
+        fprintf(stderr, "%s: invalid model_url\n", __func__);
+        return NULL;
+    }
+
+    // Initialize libcurl
+    auto * curl = curl_easy_init();
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }
+
+    if (!llama_download_file(curl, model_url, path_model)) {
+        return NULL;
+    }
+
+    // check for additional GGUFs split to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+        if (!ctx_gguf) {
+            fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+            curl_easy_cleanup(curl);
+            return NULL;
+        }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
+    }
+
+    curl_easy_cleanup(curl);
+
+    if (n_split > 1) {
[review comment] phymbert (Collaborator, Author), Mar 23, 2024:
With the current gguf-split implementation, the first file includes tensor data. In the future, the first split could contain only metadata, so that the tensor data downloads can be triggered in parallel earlier.
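For readers following the prefix logic below: the splits are expected to follow the gguf-split naming scheme (e.g. `model-00001-of-00004.gguf`). A minimal usage sketch of the two helpers, assuming the split API exposed via llama.h and illustrative paths:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    // Build the path of split #3 (0-based index 2) out of 4; expected to
    // yield something like "models/ggml-model-q4_0-00003-of-00004.gguf".
    char split_path[1024] = {0};
    llama_split_path(split_path, sizeof(split_path), "models/ggml-model-q4_0", 2, 4);
    printf("%s\n", split_path);

    // Recover the common prefix from the first split's path; returns 0 if
    // the file name does not match the expected pattern.
    char split_prefix[1024] = {0};
    if (llama_split_prefix(split_prefix, sizeof(split_prefix),
                           "models/ggml-model-q4_0-00001-of-00004.gguf", 0, 4)) {
        printf("%s\n", split_prefix); // "models/ggml-model-q4_0"
    }
    return 0;
}
```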

+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
+                fprintf(stderr, "\n%s: unexpected model file name: %s"
+                                " n_split=%d\n", __func__, path_model, n_split);
+                return NULL;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
+                fprintf(stderr, "\n%s: unexpected model url: %s"
+                                " n_split=%d\n", __func__, model_url, n_split);
+                return NULL;
+            }
+        }
+
+        // Prepare download in parallel
+        std::vector<std::future<bool>> futures_download;
+        for (int idx = 1; idx < n_split; idx++) {
+            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+                char split_path[PATH_MAX] = {0};
+                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
+
+                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
+
+                auto * curl = curl_easy_init();
+                bool res = llama_download_file(curl, split_url, split_path);
+                curl_easy_cleanup(curl);
+
+                return res;
+            }, idx));
+        }
+
+        // Wait for all downloads to complete
+        for (auto & f : futures_download) {
+            if (!f.get()) {
[review comment] phymbert (Collaborator, Author):
Here, if one download fails, the others will continue in the background. I think that is acceptable for a first version?

[review comment] Collaborator:
Yes, I think so; in the worst case the user can re-run to re-download the missing part. That reminds me of how docker downloads multiple layers.

In the future we can allow developers to modify this behavior with a new llama_download_params (see the sketch below) to specify:

  • stop_immediate_on_fail ==> true to stop if one part fails
  • n_parallel ==> maximum number of downloads that can run in parallel
  • force_redownload ==> skip checking the etag
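A sketch of what that hypothetical llama_download_params could look like; nothing below exists in this PR, and the field names are taken directly from the comment above:

```cpp
// Hypothetical future API, not part of this PR; defaults are illustrative.
struct llama_download_params {
    bool stop_immediate_on_fail = false; // stop all downloads if one part fails
    int  n_parallel             = 4;     // maximum number of parallel downloads
    bool force_redownload       = false; // skip the etag/last-modified check
};
```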
+                return NULL;
+            }
+        }
+    }

     return llama_load_model_from_file(path_model, params);
 }

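For context, a minimal caller sketch of the function this diff introduces; the URL is the example previously given in the server README, and the local path is illustrative:

```cpp
#include "common.h"
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // Downloads the GGUF (and, if split.count > 1, the remaining shards
    // in parallel), then loads the model from the local path.
    llama_model * model = llama_load_model_from_url(
        "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
        "models/phi-2-q4_0.gguf",
        mparams);
    if (model == NULL) {
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```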
7 changes: 7 additions & 0 deletions common/common.h
@@ -306,3 +306,10 @@ struct llama_control_vector_load_info {
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
 llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
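These keys are written by gguf-split and read back by the downloader above. A small sketch of the writer side, assuming the gguf API from ggml.h; the i_split/n_split/n_tensors values are whatever the splitter computes:

```cpp
#include "ggml.h"

// Stamp split metadata into one shard's KV section (writer-side sketch).
static void set_split_kv(struct gguf_context * ctx_out, int i_split, int n_split, int n_tensors) {
    gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO,            (uint16_t) i_split);
    gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT,         (uint16_t) n_split);
    gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
}
```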
4 changes: 0 additions & 4 deletions examples/gguf-split/gguf-split.cpp
@@ -26,10 +26,6 @@ enum split_operation : uint8_t {
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
     int n_split_tensors = 128;
4 changes: 3 additions & 1 deletion examples/server/README.md
@@ -20,7 +20,9 @@
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
+- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
+- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
18 changes: 17 additions & 1 deletion examples/server/server.cpp
@@ -2208,7 +2208,11 @@ static void server_print_usage(
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
     printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
-    printf("                        model download url (default: %s)\n", params.model_url.c_str());
+    printf("                        model download url (default: unused)\n");
+    printf("  -hfr REPO, --hf-repo REPO\n");
+    printf("                        Hugging Face model repository (default: unused)\n");
+    printf("  -hff FILE, --hf-file FILE\n");
+    printf("                        Hugging Face model file (default: unused)\n");
     printf("  -a ALIAS, --alias ALIAS\n");
     printf("                        set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
@@ -2337,6 +2341,18 @@ static void server_params_parse(
                 break;
             }
             params.model_url = argv[i];
+        } else if (arg == "-hfr" || arg == "--hf-repo") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_repo = argv[i];
+        } else if (arg == "-hff" || arg == "--hf-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_file = argv[i];
         } else if (arg == "-a" || arg == "--alias") {
             if (++i >= argc) {
                 invalid_param = true;