Everything works for storiesICP42Mtok4096.gguf
icppWorld committed Sep 14, 2024
1 parent f9717bd commit c2b1b07
Showing 10 changed files with 360 additions and 185 deletions.
376 changes: 231 additions & 145 deletions README.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions native/main.cpp
@@ -73,7 +73,7 @@ int main() {
mockIC.run_test(
"load_model - " + model, load_model,
candid_in,
"4449444c026c019aa1b2f90c7a6b01bc8a0100010100c800", silent_on_trap,
"4449444c026c04b2ceef2f7a819e846471c897a79907718a88f7f00b716b01bc8a0100010100c800254d6f64656c2073756363657366756c6c79206c6f6164656420696e746f206d656d6f72792e0000", silent_on_trap,
my_principal);

// -----------------------------------------------------------------------------
@@ -88,11 +88,11 @@ int main() {
// -----------------------------------------------------------------------------
// Start a new chat, which will remove the prompt-cache file if it exists
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "Cache .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/my_cache/prompt.cache not found. Nothing to delete." } })'
// '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/my_cache/prompt.cache"; input = ""; error="" } })'
mockIC.run_test(
"new_chat " + std::to_string(i) + " - " + model, new_chat,
"4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865",
"4449444c026c02b2ceef2f7a819e8464716b01bc8a0100010100c8008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f6d795f63616368652f70726f6d70742e6361636865", silent_on_trap, my_principal);
"4449444c026c04b2ceef2f7a819e846471c897a79907718a88f7f00b716b01bc8a0100010100c8008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f6d795f63616368652f70726f6d70742e63616368650000", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Generate tokens from prompt while saving everything to cache,
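For reference, the long hex strings passed to mockIC.run_test are raw Candid payloads; they all start with the magic bytes 4449444c ("DIDL"). A rough sketch of how the candid_in blob for the new_chat test above could be regenerated with ic-py's Candid encoder (the exact Types/encode API shape is assumed from memory and may need small adjustments):

from ic.candid import Types, encode

# record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} }
request = encode([{
    "type": Types.Record({"args": Types.Vec(Types.Text)}),
    "value": {"args": ["--prompt-cache", "my_cache/prompt.cache"]},
}])
print(request.hex())  # compare with the candid_in string of the new_chat test above
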
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,6 +3,6 @@

-r scripts/requirements.txt
-r src/llama_cpp_onicai_fork/requirements.txt
icpp-pro==4.1.0
icpp-pro==4.2.0
ic-py==1.0.1
binaryen.py
2 changes: 1 addition & 1 deletion scripts/optimize_wasm.py
@@ -93,7 +93,7 @@ def main() -> None:
wasm_path = (build_path / f"{icpp_toml.build_wasm['canister']}.wasm").resolve()

# save the original version
wasm_path_orig = wasm_path.with_stem(wasm_path.stem + "_before_opt").resolve()
wasm_path_orig = wasm_path.with_name(wasm_path.stem + "_before_opt" + wasm_path.suffix).resolve()
shutil.copy(wasm_path, wasm_path_orig)

# optimize the wasm
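The with_stem → with_name change above produces the same filename; Path.with_stem only exists on Python 3.9+, so the rewrite presumably keeps the script working on older interpreters (the motivation is an assumption, the equivalence is not). A quick sketch using a made-up path:

from pathlib import Path

wasm_path = Path("build/my_canister.wasm")  # placeholder path for illustration

before = wasm_path.with_stem(wasm_path.stem + "_before_opt")  # needs Python >= 3.9
after = wasm_path.with_name(wasm_path.stem + "_before_opt" + wasm_path.suffix)

assert before == after  # both give build/my_canister_before_opt.wasm
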
18 changes: 2 additions & 16 deletions scripts/upload.py
@@ -92,20 +92,6 @@ def main() -> int:
print("Not OK, response is:")
print(response)

# ---------------------------------------------------------------------------
# # A little hacky, but we do something special if we're uploading a model
# if uploading_gguf:
# # Reset the model
# print("--\nResetting the model (gguf) in canister")
# response = canister_instance.reset_model() # pylint: disable=no-member
# if "Ok" in response[0].keys():
# if DEBUG_VERBOSE >= 2:
# print("OK!")
# else:
# print("Something went wrong:")
# print(response)
# sys.exit(1)

# ---------------------------------------------------------------------------
# UPLOAD FILE

@@ -154,9 +140,9 @@ def main() -> int:
offset += len(chunk)

# ---------------------------------------------------------------------------
# A little hacky, but we do something special if we're uploading a model
# Do something special if we're uploading a llama_cpp_canister model (gguf)
if uploading_gguf:
# load the model inside the canister
# load the model inside the canister into Orthogonal Persisted memory
print(
"--\nInstruct canister to load the model, getting it ready for inference."
)
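Since load_model now replies with a full RunOutputRecord (see the .did changes below) rather than a bare status code, the check after this call can surface the canister's output and error text. A minimal sketch, assuming the ic-py canister_instance, imports, and response[0] conventions already used in this script; the model path is a placeholder:

response = canister_instance.load_model(
    {"args": ["--model", "models/storiesICP42Mtok4096.gguf"]}
)
if "Ok" in response[0].keys():
    print(response[0]["Ok"]["output"])  # e.g. the "loaded into memory" message
else:
    print("Something went wrong:")
    print(response[0]["Err"]["error"])
    sys.exit(1)
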
9 changes: 7 additions & 2 deletions src/llama_cpp.did
@@ -7,7 +7,12 @@ type InputRecord = record {
args : vec text; // the CLI args of llama.cpp/examples/main, as a list of strings
};

type RunOutputRecord = record { status: StatusCode; output: text };
type RunOutputRecord = record {
status: StatusCode;
input: text;
output: text;
error: text
};
type OutputRecordResult = variant {
Ok : RunOutputRecord;
Err : RunOutputRecord;
@@ -65,7 +70,7 @@ service : {
ready : () -> (StatusCodeRecordResult) query;

// model endpoints
load_model : (InputRecord) -> (StatusCodeRecordResult);
load_model : (InputRecord) -> (OutputRecordResult);

// up & down load of files
file_download_chunk : (FileDownloadInputRecord) -> (FileDownloadRecordResult) query;
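Because Ok and Err now carry the same record shape, a client can unpack either branch uniformly. A small helper sketch, assuming an ic-py style decoded reply in which the record fields come back under their .did names:

def unpack_output_record(reply):
    """Return (ok, status, input, output, error) from an OutputRecordResult reply."""
    branch = "Ok" if "Ok" in reply[0] else "Err"
    record = reply[0][branch]
    return (branch == "Ok", record["status"], record["input"],
            record["output"], record["error"])
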
58 changes: 55 additions & 3 deletions src/main_.cpp
@@ -136,13 +136,16 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
return formatted;
}

int main_(int argc, char ** argv, std::string principal_id) {
int main_(int argc, char ** argv, std::string principal_id, bool load_model_only, std::string &icpp_error_msg, std::ostringstream &input_ss, std::ostringstream &output_ss) {
gpt_params params;

g_params = &params;

if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
// ICPP-PATCH-START
icpp_error_msg = "Error in gpt_params_print_usage.";
// ICPP-PATCH-END
return 1;
}

@@ -222,19 +225,33 @@ int main_(int argc, char ** argv, std::string principal_id) {

// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::cout << __func__ << ": icpp-debug 1 " << std::endl;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
std::cout << __func__ << ": icpp-debug 2 " << std::endl;
if (sparams.cfg_scale > 1.f) {
std::cout << __func__ << ": icpp-debug 3 " << std::endl;
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
std::cout << __func__ << ": icpp-debug 4 " << std::endl;
ctx_guidance = llama_new_context_with_model(model, lparams);
std::cout << __func__ << ": icpp-debug 5 " << std::endl;
}
std::cout << __func__ << ": icpp-debug 6 " << std::endl;

if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
// ICPP-PATCH-START
icpp_error_msg = std::format("{}: error: unable to load model)", __func__);
// ICPP-PATCH-END
return 1;
}
// ICPP-PATCH-START
// Skip loading the model if the --model parameter is not provided
}

// And return if we are asked to ONLY load the model
if (load_model_only) {
return 0;
}
// ICPP-PATCH-END

const int n_ctx_train = llama_n_ctx_train(model);
@@ -270,19 +287,27 @@ int main_(int argc, char ** argv, std::string principal_id) {

if (!path_session.empty()) {
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
std::cout << __func__ << ": icpp-debug A 1 " << std::endl;
if (!file_exists(path_session)) {
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
std::cout << __func__ << ": icpp-debug A 2 " << std::endl;
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
std::cout << __func__ << ": icpp-debug A 3 " << std::endl;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
// ICPP-PATCH-START
icpp_error_msg = std::format("{}: error: failed to load session file '{}')", __func__, path_session.c_str());
// ICPP-PATCH-END
return 1;
}
std::cout << __func__ << ": icpp-debug A 4 " << std::endl;
session_tokens.resize(n_token_count_out);
std::cout << __func__ << ": icpp-debug A 5 " << std::endl;
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
@@ -343,7 +368,10 @@ int main_(int argc, char ** argv, std::string principal_id) {

if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; // ICPP-PATCH - TODO - HANDLE THIS
// ICPP-PATCH-START
icpp_error_msg = std::format("{}: error: prompt is too long ({} tokens, max {})", __func__, (int) embd_inp.size(), n_ctx - 4);
// ICPP-PATCH-END
return 1;
}

// debug message about similarity of saved session, if applicable
@@ -535,9 +563,12 @@ int main_(int argc, char ** argv, std::string principal_id) {

std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
// std::ostringstream output_ss; g_output_ss = &output_ss;
g_output_ss = &output_ss; // ICPP-PATCH: we pass this in via argument,
// so we can return it to canister caller
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode


// the first thing we will do is to output the prompt, so set color accordingly
// console::set_display(console::prompt);
display = params.display_prompt;
@@ -565,6 +596,9 @@

if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
// ICPP-PATCH-START
icpp_error_msg = std::format("{}: error: failed to eval (-1-)", __func__);
// ICPP-PATCH-END
return 1;
}

@@ -707,6 +741,9 @@ int main_(int argc, char ** argv, std::string principal_id) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
// ICPP-PATCH-START
icpp_error_msg = std::format("{}: error: failed to eval (-2-)", __func__);
// ICPP-PATCH-END
return 1;
}

@@ -724,6 +761,9 @@ int main_(int argc, char ** argv, std::string principal_id) {

if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
// ICPP-PATCH-START
icpp_error_msg = std::format("{}: error: failed to eval (-3-)", __func__);
// ICPP-PATCH-END
return 1;
}

@@ -799,6 +839,9 @@ int main_(int argc, char ** argv, std::string principal_id) {
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
// ICPP-PATCH-START
input_ss << token_str;
// ICPP-PATCH-END
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
@@ -996,6 +1039,15 @@ int main_(int argc, char ** argv, std::string principal_id) {
}
}

// ICPP-PATCH-START
// The last token is not yet stored in session_tokens
if (!embd.empty() && !path_session.empty()) {
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
n_session_consumed = session_tokens.size();
}

// ICPP-PATCH-END

if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
4 changes: 3 additions & 1 deletion src/main_.h
@@ -1,5 +1,7 @@
#pragma once

int main_(int argc, char **argv, std::string principal_id);
#include <sstream>

int main_(int argc, char **argv, std::string principal_id, bool load_model_only, std::string &icpp_error_msg, std::ostringstream &input_ss, std::ostringstream &output_ss);
void free_model();
void reset_static_memory();
30 changes: 25 additions & 5 deletions src/model.cpp
@@ -23,17 +23,37 @@ void load_model() {
// Get the data from the wire and prepare arguments for main_
auto [argc, argv, args] = get_args_for_main(ic_api);

// Lets go.
ready_for_inference = true;

// First free the OP memory of a previously loaded model
free_model();

// Call main_, just like it is called in the llama-cli app
main_(argc, argv.data(), principal_id);
std::string icpp_error_msg;
std::ostringstream input_ss;
std::ostringstream output_ss;
bool load_model_only = true;
int result = main_(argc, argv.data(), principal_id, load_model_only, icpp_error_msg, input_ss, output_ss);

// Exit if there was an error
if (result !=0) {
CandidTypeRecord r_out;
r_out.append("status", CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500
r_out.append("input", CandidTypeText{""});
r_out.append("output", CandidTypeText{""});
r_out.append("error", CandidTypeText{icpp_error_msg});
ic_api.to_wire(CandidTypeVariant{"Err", r_out});
return;
}

// If we get this far, everything is Ok and ready to be used
ready_for_inference = true;

CandidTypeRecord status_code_record;
status_code_record.append("status_code",
CandidTypeNat16{Http::StatusCode::OK});
ic_api.to_wire(CandidTypeVariant{"Ok", status_code_record});
CandidTypeRecord r_out;
r_out.append("status", CandidTypeNat16{Http::StatusCode::OK}); // 200
r_out.append("input", CandidTypeText{""});
r_out.append("output", CandidTypeText{"Model succesfully loaded into memory."});
r_out.append("error", CandidTypeText{""});
ic_api.to_wire(CandidTypeVariant{"Ok", r_out});
}
40 changes: 32 additions & 8 deletions src/run.cpp
@@ -2,6 +2,7 @@
#include "main_.h"
#include "utils.h"
#include "common.h"
#include "http.h"

#include <iostream>
#include <string>
@@ -85,9 +86,14 @@ void new_chat() {
msg = "Cache file " + path_session + " deleted successfully";
} else {
msg = "Error deleting cache file " + path_session;
ic_api.to_wire(CandidTypeVariant{
"Err", CandidTypeVariant{"Other", CandidTypeText{std::string(__func__) +
": " + msg}}});

// Return output over the wire
CandidTypeRecord r_out;
r_out.append("status", CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500
r_out.append("input", CandidTypeText{""});
r_out.append("output", CandidTypeText{""});
r_out.append("error", CandidTypeText{msg});
ic_api.to_wire(CandidTypeVariant{"Err", r_out});
return;
}
} else {
@@ -101,8 +107,10 @@

// Return output over the wire
CandidTypeRecord r_out;
r_out.append("status", CandidTypeNat16{200});
r_out.append("status", CandidTypeNat16{Http::StatusCode::OK}); // 200
r_out.append("input", CandidTypeText{""});
r_out.append("output", CandidTypeText{msg});
r_out.append("error", CandidTypeText{""});
ic_api.to_wire(CandidTypeVariant{"Ok", r_out});
}

Expand All @@ -115,13 +123,29 @@ void run(IC_API &ic_api) {
auto [argc, argv, args] = get_args_for_main(ic_api);

// Call main_, just like it is called in the llama-cli app
main_(argc, argv.data(), principal_id);
std::string icpp_error_msg;
std::ostringstream input_ss; // input tokens (from prompt or session cache)
std::ostringstream output_ss; // output tokens (generated during this call)
bool load_model_only = false;
int result = main_(argc, argv.data(), principal_id, load_model_only, icpp_error_msg, input_ss, output_ss);

// Exit if there was an error
if (result !=0) {
CandidTypeRecord r_out;
r_out.append("status", CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500
r_out.append("input", CandidTypeText{input_ss.str()});
r_out.append("output", CandidTypeText{output_ss.str()});
r_out.append("error", CandidTypeText{icpp_error_msg});
ic_api.to_wire(CandidTypeVariant{"Err", r_out});
return;
}

// Return output over the wire
CandidTypeRecord r_out;
r_out.append("status", CandidTypeNat16{200}); // TODO: set the status code
r_out.append("output",
CandidTypeText{"TODO: we need to add some output here.... "});
r_out.append("status", CandidTypeNat16{Http::StatusCode::OK}); // 200
r_out.append("input", CandidTypeText{input_ss.str()});
r_out.append("output", CandidTypeText{output_ss.str()});
r_out.append("error", CandidTypeText{""});
ic_api.to_wire(CandidTypeVariant{"Ok", r_out});
}

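Taken together, new_chat and run now give a client everything it needs to drive a cached chat and read back both the consumed input and the generated output. A minimal end-to-end sketch with ic-py; it assumes `canister` is an ic.canister.Canister built from src/llama_cpp.did and that run() above is exposed as an update call named run_update (that name does not appear in this diff):

chat_args = ["--prompt-cache", "my_cache/prompt.cache"]

# start a new chat; removes the prompt-cache file if it already exists
print(canister.new_chat({"args": chat_args})[0])

# generate a few tokens while saving everything to the cache
reply = canister.run_update(
    {"args": chat_args + ["--prompt", "Once upon a time", "-n", "20"]}
)[0]
record = reply.get("Ok") or reply.get("Err")
print("input :", record["input"])   # tokens consumed from the prompt / session cache
print("output:", record["output"])  # tokens generated during this call
print("error :", record["error"])   # non-empty only on the Err branch
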
