Merge 'origin/master' into hipblas
SlyEcho committed Jun 25, 2023
2 parents df7346c + 66a2555 commit 35a6031
Showing 22 changed files with 452 additions and 211 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729

@@ -29,6 +30,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li>
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -543,6 +545,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```

### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)

OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
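
For reference, a minimal end-to-end sketch of these two steps (illustration only, not part of this diff; the `git lfs` download route, the `ggml-model-f16.bin` output name, and the optional `./quantize` invocation are assumptions about the tooling as of mid-2023):

```
# download the 3B weights from Hugging Face (requires git-lfs)
git lfs install
git clone https://huggingface.co/openlm-research/open_llama_3b models/open_llama_3b

# convert to ggml FP16 format; the output filename may differ depending on convert.py defaults
python convert.py models/open_llama_3b

# optionally quantize to 4 bits to reduce memory usage
./quantize models/open_llama_3b/ggml-model-f16.bin models/open_llama_3b/ggml-model-q4_0.bin q4_0
```
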
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
- Obtain the `tokenizer.model` file from the LLaMA model and put it in `models`
@@ -672,12 +681,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
./main (...)
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```

For easy and swift re-execution, consider putting this final part in a `.sh` script file so you can rerun the process with minimal hassle.

Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
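
A sketch of such a wrapper script, assuming the OpenCL build above (the model filename, prompt, and `./main` flags are placeholders to adapt to your setup):

```
#!/bin/sh
# run.sh - wrapper for the OpenCL build on Android
export GGML_OPENCL_PLATFORM=0
export GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH

# placeholder model path and generation flags
./main -m models/ggml-model-q4_0.bin -p "Hello" -n 64
```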

### Docker

#### Prerequisites
91 changes: 44 additions & 47 deletions build.zig
@@ -1,61 +1,58 @@
const std = @import("std");

// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardReleaseOptions();
const want_lto = b.option(bool, "lto", "Want -fLTO");

const lib = b.addStaticLibrary("llama", null);
lib.want_lto = want_lto;
lib.setTarget(target);
lib.setBuildMode(optimize);
const optimize = b.standardOptimizeOption(.{});
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
.optimize = optimize,
});
lib.linkLibC();
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addIncludePath("./examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
lib.install();

const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };

const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);

// create "zig build run" command for ./main

const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
b.installArtifact(lib);

const examples = .{
"main",
"baby-llama",
"embedding",
// "metal",
"perplexity",
"quantize",
"quantize-stats",
"save-load-state",
// "server",
"simple",
"train-text-from-scratch",
};

inline for (examples) |example_name| {
const exe = b.addExecutable(.{
.name = example_name,
.target = target,
.optimize = optimize,
});
exe.addIncludePath(".");
exe.addIncludePath("./examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| run_cmd.addArgs(args);
const run_step = b.step("run_" ++ example_name, "Run the app");
run_step.dependOn(&run_cmd.step);
}

const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}

fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const want_lto = args.want_lto;

const exe = b.addExecutable(name, null);
exe.want_lto = want_lto;
lib.setTarget(args.target);
lib.setBuildMode(args.optimize);
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();

return exe;
}
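
With this layout each example is installed as its own artifact and gets a `run_<name>` step. Typical usage might look like the following sketch (the install path and `main` flags are assumptions based on Zig 0.11 defaults, not part of this diff):

```
# build the library and every listed example (needs a matching Zig 0.11 dev compiler)
zig build

# run the `main` example via its generated step; arguments after `--` are forwarded
zig build run_main -- -m models/ggml-model-q4_0.bin -p "Hello"

# installed binaries land under zig-out/bin/ by default
./zig-out/bin/main --help
```
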
4 changes: 2 additions & 2 deletions convert.py
@@ -998,9 +998,9 @@ def write_vocab(self, vocab: Vocab) -> None:
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
n_head=1, n_layer=0)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type=GGMLFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()

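The vocab-only path above is reached through `convert.py`'s vocab-only mode; a usage sketch follows (the `--vocab-only` and `--outfile` flags and the output name are assumptions about the script's CLI at this point, not part of this diff):

```
# write only the vocabulary (no tensor data) to a standalone ggml file
python convert.py --vocab-only --outfile models/ggml-vocab.bin <path to directory containing tokenizer.model>
```
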
22 changes: 15 additions & 7 deletions examples/common.cpp
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return res;
}

struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();

lparams.n_ctx = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;

llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
}

llama_context * lctx = llama_new_context_with_model(model, lparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return NULL;
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}

if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(lctx,
int err = llama_model_apply_lora_from_file(model,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return NULL;
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}

return lctx;
return std::make_tuple(model, lctx);
}

void console_init(console_state & con_st) {
3 changes: 2 additions & 1 deletion examples/common.h
@@ -9,6 +9,7 @@
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>

#if !defined (_WIN32)
#include <stdio.h>
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
// Model utils
//

struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);

//
// Console utils
6 changes: 4 additions & 2 deletions examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {

llama_init_backend();

llama_model * model;
llama_context * ctx;

// load the model
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
8 changes: 6 additions & 2 deletions examples/main/main.cpp
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {

llama_init_backend();

llama_model * model;
llama_context * ctx;
g_ctx = &ctx;

// load the model and apply lora adapter, if any
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -139,6 +140,7 @@

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");
llama_free(ctx);
llama_free_model(model);

return 0;
}
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
6 changes: 4 additions & 2 deletions examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {

llama_init_backend();

llama_model * model;
llama_context * ctx;

// load the model and apply lora adapter, if any
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -169,6 +170,7 @@

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
15 changes: 13 additions & 2 deletions examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "Loading model\n");

const int64_t t_main_start_us = ggml_time_us();
llama_model * model;
llama_context * ctx;

{
@@ -330,12 +331,20 @@
lparams.f16_kv = false;
lparams.use_mlock = false;

ctx = llama_init_from_file(params.model.c_str(), lparams);
model = llama_load_model_from_file(params.model.c_str(), lparams);

if (ctx == NULL) {
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}

ctx = llama_new_context_with_model(model, lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return 1;
}
}

const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
llama_free(ctx);
llama_free_model(model);
return 1;
}
included_layers++;
@@ -415,6 +425,7 @@


llama_free(ctx);
llama_free_model(model);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
